
Commit 1e7b50e

Merge pull request #414 from kzys/jailer-cleanup

Remove a shim directory on StopVM() correctly and synchronously

2 parents: 4081b0b + 78007d6

File tree: 5 files changed, +148 −68 lines

runtime/jailer_integ_test.go

25 additions, 15 deletions

```diff
@@ -15,12 +15,15 @@ package main
 
 import (
 	"context"
+	"os"
+	"path/filepath"
 	"testing"
 
 	"github.com/containerd/containerd"
 	"github.com/containerd/containerd/namespaces"
 	"github.com/containerd/containerd/oci"
 	"github.com/containerd/containerd/pkg/ttrpcutil"
+	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"
 
 	_ "github.com/firecracker-microvm/firecracker-containerd/firecracker-control"
@@ -78,24 +81,31 @@ func testJailer(t *testing.T, jailerConfig *proto.JailerConfig) {
 	stdout := startAndWaitTask(ctx, t, c)
 	require.Equal("hello", stdout)
 
-	defer func() {
-		err := c.Delete(ctx, containerd.WithSnapshotCleanup)
-		require.NoError(err, "failed to delete a container")
-	}()
+	stat, err := os.Stat(filepath.Join(shimBaseDir(), "default", vmID))
+	require.NoError(err)
+	assert.True(t, stat.IsDir())
+
+	err = c.Delete(ctx, containerd.WithSnapshotCleanup)
+	require.NoError(err, "failed to delete a container")
+
+	_, err = fcClient.StopVM(ctx, &proto.StopVMRequest{VMID: vmID})
+	require.NoError(err)
+
+	_, err = os.Stat(filepath.Join(shimBaseDir(), "default", vmID))
+	assert.Error(t, err)
+	assert.True(t, os.IsNotExist(err))
 }
 
 func TestJailerCPUSet_Isolated(t *testing.T) {
 	prepareIntegTest(t)
 
-	t.Run("TestJailerCPUSet_Isolated", func(t *testing.T) {
-		b := cpuset.Builder{}
-		cset := b.AddCPU(0).AddMem(0).Build()
-		config := &proto.JailerConfig{
-			CPUs: cset.CPUs(),
-			Mems: cset.Mems(),
-			UID:  300000,
-			GID:  300000,
-		}
-		testJailer(t, config)
-	})
+	b := cpuset.Builder{}
+	cset := b.AddCPU(0).AddMem(0).Build()
+	config := &proto.JailerConfig{
+		CPUs: cset.CPUs(),
+		Mems: cset.Mems(),
+		UID:  300000,
+		GID:  300000,
+	}
+	testJailer(t, config)
 }
```
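The reworked test pins down the shim directory's whole lifecycle: it must exist and be a directory while the VM runs, and os.Stat must fail with a not-exist error as soon as StopVM returns. A minimal standalone sketch of that existence check (the path below is a placeholder, not a real shim directory):

```go
package main

import (
	"fmt"
	"os"
)

// dirExists reports whether path exists and is a directory, mirroring the
// os.Stat / os.IsNotExist checks the test performs around StopVM.
func dirExists(path string) (bool, error) {
	stat, err := os.Stat(path)
	if os.IsNotExist(err) {
		return false, nil // cleanly absent: the state expected after StopVM
	}
	if err != nil {
		return false, err // some other failure, e.g. permissions
	}
	return stat.IsDir(), nil
}

func main() {
	ok, err := dirExists("/tmp/example-shim-dir") // placeholder path
	fmt.Println(ok, err)
}
```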

runtime/noop_jailer.go

3 additions, 1 deletion

```diff
@@ -118,4 +118,6 @@ func (j *noopJailer) Stop() error {
 	return err
 }
 
-func (j *noopJailer) Close() error { return nil }
+func (j *noopJailer) Close() error {
+	return os.RemoveAll(j.shimDir.RootPath())
+}
```
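With this change even the no-op jailer removes its shim directory on Close, so StopVM's synchronous cleanup works whether or not jailing is enabled. A hedged sketch of the contract this implies; the repository's actual jailer interface has more methods than shown here:

```go
package jailerexample

// Sketch only: the real jailer interface in this repository is larger. The
// point is that Close is now the single place where a jailer's on-disk state
// gets removed: the shim directory for noopJailer, the OCI bundle directory
// for runcJailer.
type jailer interface {
	// Stop terminates the jailed process.
	Stop() error
	// Close releases remaining resources, including directories left on disk.
	Close() error
}
```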

runtime/runc_jailer.go

15 additions, 4 deletions

```diff
@@ -23,9 +23,11 @@ import (
 	"path/filepath"
 	"strings"
 	"syscall"
+	"time"
 
 	"github.com/containerd/go-runc"
 	"github.com/firecracker-microvm/firecracker-go-sdk"
+	"github.com/hashicorp/go-multierror"
 	"github.com/opencontainers/runtime-spec/specs-go"
 	"github.com/pkg/errors"
 	"github.com/sirupsen/logrus"
@@ -154,7 +156,7 @@ func (j *runcJailer) BuildJailedMachine(cfg *config.Config, machineConfig *firecracker.Config, vmID string)
 func (j *runcJailer) BuildJailedRootHandler(cfg *config.Config, machineConfig *firecracker.Config, vmID string) firecracker.Handler {
 	ociBundlePath := j.OCIBundlePath()
 	rootPath := j.RootPath()
-	machineConfig.SocketPath = filepath.Join(rootPath, "api.socket")
+	machineConfig.SocketPath = filepath.Join(rootfsFolder, "api.socket")
 
 	return firecracker.Handler{
 		Name: jailerHandlerName,
@@ -466,9 +468,18 @@ func (j *runcJailer) setDefaultConfigValues(cfg *config.Config, socketPath string)
 // Close will cleanup the container that may be left behind if the jailing
 // process was killed via SIGKILL.
 func (j *runcJailer) Close() error {
-	return j.runcClient.Delete(j.ctx, j.vmID, &runc.DeleteOpts{
-		Force: true,
-	})
+	// Even if the jailer's associated context is cancelled,
+	// we'd like to do the cleanups below just in case.
+	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+	defer cancel()
+
+	// Delete the container, if it is still running.
+	runcErr := j.runcClient.Delete(ctx, j.vmID, &runc.DeleteOpts{Force: true})
+
+	// Regardless of the result, remove the directory.
+	removeErr := os.RemoveAll(j.OCIBundlePath())
+
+	return multierror.Append(runcErr, removeErr).ErrorOrNil()
 }
 
 func mkdirAndChown(path string, mode os.FileMode, uid, gid uint32) error {
```
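Close now reports both failures instead of stopping at the first one: the runc delete error and the directory removal error are collected with hashicorp/go-multierror, and ErrorOrNil() collapses the all-success case back to a plain nil. A small self-contained example of that pattern:

```go
package main

import (
	"errors"
	"fmt"

	"github.com/hashicorp/go-multierror"
)

func main() {
	// Nil inputs are skipped, so two successes collapse back to a nil error
	// and callers can keep the usual "err != nil" check.
	fmt.Println(multierror.Append(nil, nil).ErrorOrNil()) // <nil>

	// Any failure survives, even when the other step succeeded.
	runcErr := errors.New("runc delete failed")
	err := multierror.Append(runcErr, nil).ErrorOrNil()
	fmt.Println(err) // "1 error occurred: ..." with the runc failure listed
}
```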

runtime/service.go

47 additions, 27 deletions

```diff
@@ -20,7 +20,6 @@ import (
 	"math"
 	"net"
 	"os"
-	"path/filepath"
 	"runtime/debug"
 	"strconv"
 	"strings"
@@ -122,10 +121,7 @@ type service struct {
 	config *config.Config
 
 	// vmReady is closed once CreateVM has been successfully called
-	vmReady chan struct{}
-	// vmExitErr is set with the error returned by machine.Wait(). It should
-	// only be read after shimCtx.Done() is closed
-	vmExitErr error
+	vmReady           chan struct{}
 	vmStartOnce       sync.Once
 	agentClient       taskAPI.TaskService
 	eventBridgeClient eventbridge.Getter
@@ -135,6 +131,9 @@ type service struct {
 	driveMountStubs          []MountableStubDrive
 	exitAfterAllTasksDeleted bool // exit the VM and shim when all tasks are deleted
 
+	cleanupErr  error
+	cleanupOnce sync.Once
+
 	machine          *firecracker.Machine
 	machineConfig    *firecracker.Config
 	vsockIOPortCount uint32
@@ -492,8 +491,18 @@ func (s *service) createVM(requestCtx context.Context, request *proto.CreateVMRequest)
 		}
 	}()
 
+	namespace, ok := namespaces.Namespace(s.shimCtx)
+	if !ok {
+		namespace = namespaces.Default
+	}
+
+	dir, err := vm.ShimDir(s.config.ShimBaseDir, namespace, s.vmID)
+	if err != nil {
+		return err
+	}
+
 	s.logger.Info("creating new VM")
-	s.jailer, err = newJailer(s.shimCtx, s.logger, filepath.Join(s.config.ShimBaseDir, s.vmID), s, request)
+	s.jailer, err = newJailer(s.shimCtx, s.logger, dir.RootPath(), s, request)
 	if err != nil {
 		return errors.Wrap(err, "failed to create jailer")
 	}
@@ -574,7 +583,6 @@ func (s *service) StopVM(requestCtx context.Context, request *proto.StopVMRequest)
 		timeout = time.Duration(request.TimeoutSeconds) * time.Second
 	}
 
-	defer s.shimCancel()
 	err = s.waitVMReady()
 	if err != nil {
 		return nil, err
@@ -1165,11 +1173,18 @@ func (s *service) shutdown(
 		s.shutdownLoop(requestCtx, timeout, req)
 	}()
 
-	err := s.machine.Wait(context.Background())
-	if err == nil {
-		return nil
+	var result *multierror.Error
+	if err := s.machine.Wait(context.Background()); err != nil {
+		result = multierror.Append(result, err)
+	}
+	if err := s.cleanup(); err != nil {
+		result = multierror.Append(result, err)
 	}
-	return status.Error(codes.Internal, fmt.Sprintf("the VMM was killed forcibly: %v", err))
+
+	if err := result.ErrorOrNil(); err != nil {
+		return status.Error(codes.Internal, fmt.Sprintf("the VMM was killed forcibly: %v", err))
+	}
+	return nil
 }
 
 // shutdownLoop sends multiple different shutdown requests to stop the VMM.
@@ -1204,13 +1219,6 @@ func (s *service) shutdownLoop(
 			},
 			timeout: jailerStopTimeout,
 		},
-		{
-			name: "cancel the context",
-			shutdown: func() error {
-				s.shimCancel()
-				return nil
-			},
-		},
 	}
 
 	for _, action := range actions {
@@ -1286,26 +1294,38 @@ func (s *service) Cleanup(requestCtx context.Context) (*taskAPI.DeleteResponse, error)
 	}, nil
 }
 
-func (s *service) monitorVMExit() {
-	defer func() {
+// cleanup resources
+func (s *service) cleanup() error {
+	s.cleanupOnce.Do(func() {
+		var result *multierror.Error
 		// we log the error here because cleanup will only succeed if the
 		// jailing process was killed via SIGKILL
 		if err := s.jailer.Close(); err != nil {
+			result = multierror.Append(result, err)
 			s.logger.WithError(err).Error("failed to close jailer")
 		}
 
+		if err := s.publishVMStop(); err != nil {
+			result = multierror.Append(result, err)
+			s.logger.WithError(err).Error("failed to publish stop VM event")
+		}
+
 		// once the VM shuts down, the shim should too
 		s.shimCancel()
-	}()
 
+		s.cleanupErr = result.ErrorOrNil()
+	})
+	return s.cleanupErr
+}
+
+// monitorVMExit watches the VM and cleans up resources when it terminates.
+func (s *service) monitorVMExit() {
 	// Block until the VM exits
-	s.vmExitErr = s.machine.Wait(s.shimCtx)
-	if s.vmExitErr != nil && s.vmExitErr != context.Canceled {
-		s.logger.WithError(s.vmExitErr).Error("error returned from VM wait")
+	if err := s.machine.Wait(s.shimCtx); err != nil && err != context.Canceled {
+		s.logger.WithError(err).Error("error returned from VM wait")
	}
 
-	publishErr := s.publishVMStop()
-	if publishErr != nil {
-		s.logger.WithError(publishErr).Error("failed to publish stop VM event")
+	if err := s.cleanup(); err != nil {
+		s.logger.WithError(err).Error("failed to clean up the VM")
 	}
 }
```
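Both the StopVM-driven shutdown path and monitorVMExit now funnel into one cleanup method. sync.Once makes that safe when the two paths race, and caching the error means every caller observes the same result. A minimal sketch of the pattern, with placeholder names and work:

```go
package main

import (
	"fmt"
	"sync"
)

// vmService is a stand-in for the shim's service struct.
type vmService struct {
	cleanupErr  error
	cleanupOnce sync.Once
}

// cleanup runs its body at most once; later or concurrent callers block
// until the first run finishes, then receive the cached error.
func (s *vmService) cleanup() error {
	s.cleanupOnce.Do(func() {
		fmt.Println("closing jailer, publishing VMStop, cancelling shim context")
		s.cleanupErr = nil // whatever the real cleanup work returned
	})
	return s.cleanupErr
}

func main() {
	s := &vmService{}
	_ = s.cleanup() // performs the cleanup
	_ = s.cleanup() // no-op: returns the cached result
}
```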

runtime/service_integ_test.go

58 additions, 21 deletions

```diff
@@ -470,9 +470,12 @@ func testMultipleExecs(
 	close(execStdouts)
 
 	if jailerConfig != nil {
+		dir, err := vm.ShimDir(shimBaseDir(), "default", vmIDStr)
+		require.NoError(t, err)
+
 		jailer := &runcJailer{
 			Config: runcJailerConfig{
-				OCIBundlePath: filepath.Join(shimBaseDir(), vmIDStr),
+				OCIBundlePath: dir.RootPath(),
 			},
 			vmID: vmIDStr,
 		}
@@ -580,35 +583,69 @@ func TestLongUnixSocketPath_Isolated(t *testing.T) {
 	// default location we store state results in a path like
 	// "/run/firecracker-containerd/default/<vmID>" (with len 112).
 	const maxUnixSockLen = 108
-	vmID := strings.Repeat("x", 76)
+	vmID := strings.Repeat("x", 72)
 
 	ctx := namespaces.WithNamespace(context.Background(), "default")
 
 	pluginClient, err := ttrpcutil.NewClient(containerdSockPath + ".ttrpc")
 	require.NoError(t, err, "failed to create ttrpc client")
 
-	fcClient := fccontrol.NewFirecrackerClient(pluginClient.Client())
-	_, err = fcClient.CreateVM(ctx, &proto.CreateVMRequest{
-		VMID:              vmID,
-		NetworkInterfaces: []*proto.FirecrackerNetworkInterface{},
-	})
-	require.NoError(t, err, "failed to create VM")
+	subtests := []struct {
+		name    string
+		request proto.CreateVMRequest
+	}{
+		{
+			name: "Without Jailer",
+			request: proto.CreateVMRequest{
+				VMID:              vmID + "noop",
+				NetworkInterfaces: []*proto.FirecrackerNetworkInterface{},
+			},
+		},
+		{
+			name: "With Jailer",
+			request: proto.CreateVMRequest{
+				// We somehow cannot use the same VM ID here.
+				// https://github.com/firecracker-microvm/firecracker-containerd/issues/409
+				VMID:              vmID + "jail",
+				NetworkInterfaces: []*proto.FirecrackerNetworkInterface{},
+				JailerConfig: &proto.JailerConfig{
+					UID: 30000,
+					GID: 30000,
+				},
+			},
+		},
+	}
 
-	// double-check that the sockets are at the expected path and that their absolute
-	// length exceeds 108 bytes
-	shimDir, err := vm.ShimDir(cfg.ShimBaseDir, "default", vmID)
-	require.NoError(t, err, "failed to get shim dir")
+	fcClient := fccontrol.NewFirecrackerClient(pluginClient.Client())
+	for _, subtest := range subtests {
+		request := subtest.request
+		vmID := request.VMID
+		t.Run(subtest.name, func(t *testing.T) {
+			_, err = fcClient.CreateVM(ctx, &request)
+			require.NoError(t, err, "failed to create VM")
 
-	_, err = os.Stat(shimDir.FirecrackerSockPath())
-	require.NoError(t, err, "failed to stat firecracker socket path")
-	if len(shimDir.FirecrackerSockPath()) <= maxUnixSockLen {
-		assert.Failf(t, "firecracker sock absolute path %q is not greater than max unix socket path length", shimDir.FirecrackerSockPath())
-	}
+			// double-check that the sockets are at the expected path and that their absolute
+			// length exceeds 108 bytes
+			shimDir, err := vm.ShimDir(cfg.ShimBaseDir, "default", vmID)
+			require.NoError(t, err, "failed to get shim dir")
 
-	_, err = os.Stat(shimDir.FirecrackerVSockPath())
-	require.NoError(t, err, "failed to stat firecracker vsock path")
-	if len(shimDir.FirecrackerVSockPath()) <= maxUnixSockLen {
-		assert.Failf(t, "firecracker vsock absolute path %q is not greater than max unix socket path length", shimDir.FirecrackerVSockPath())
+			if request.JailerConfig == nil {
+				_, err = os.Stat(shimDir.FirecrackerSockPath())
+				require.NoError(t, err, "failed to stat firecracker socket path")
+				if len(shimDir.FirecrackerSockPath()) <= maxUnixSockLen {
+					assert.Failf(t, "firecracker sock absolute path %q is not greater than max unix socket path length", shimDir.FirecrackerSockPath())
+				}
+
+				_, err = os.Stat(shimDir.FirecrackerVSockPath())
+				require.NoError(t, err, "failed to stat firecracker vsock path")
+				if len(shimDir.FirecrackerVSockPath()) <= maxUnixSockLen {
+					assert.Failf(t, "firecracker vsock absolute path %q is not greater than max unix socket path length", shimDir.FirecrackerVSockPath())
+				}
+			}
+
+			_, err = fcClient.StopVM(ctx, &proto.StopVMRequest{VMID: vmID})
+			require.NoError(t, err)
+		})
 	}
 }
```
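The repeated-"x" ID shrinks from 76 to 72 characters because each subtest now appends a 4-character suffix ("noop" or "jail"); the full state path keeps its former 112-byte length, still past the 108-byte sun_path limit the test exists to exercise. A quick check of that arithmetic, with the base path taken from the test's own comment:

```go
package main

import (
	"fmt"
	"path/filepath"
	"strings"
)

func main() {
	// Base path assumed from the test's comment, not read from the config.
	base := "/run/firecracker-containerd/default"
	vmID := strings.Repeat("x", 72) + "noop" // 76 chars, same total as before

	path := filepath.Join(base, vmID)
	fmt.Println(len(path), len(path) > 108) // prints: 112 true
}
```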
