Skip to content

Commit dac4171

Browse files
lifubangcyphar
andcommitted
runc-dmz: reduce memfd binary cloning cost with small C binary
The idea is to remove the need for cloning the entire runc binary by replacing the final execve() call of the container process with an execve() call to a clone of a small C binary which just does an execve() of its arguments. This provides similar protection against CVE-2019-5736 but without requiring a >10MB binary copy for each "runc init". When compiled with musl, runc-dmz is 13kB (though unfortunately with glibc, it is 1.1MB which is still quite large). It should be noted that there is still a window where the container processes could get access to the host runc binary, but because we set ourselves as non-dumpable the container would need CAP_SYS_PTRACE (which is not enabled by default in Docker) in order to get around the proc_fd_access_allowed() checks. In addition, since Linux 4.10[1] the kernel blocks access entirely for user namespaced containers in this scenario. For those cases we cannot use runc-dmz, but most containers won't have this issue. This new runc-dmz binary can be opted out of at compile time by setting the "runc_nodmz" buildtag, and at runtime by setting the RUNC_DMZ=legacy environment variable. In both cases, runc will fall back to the classic /proc/self/exe-based cloning trick. If /proc/self/exe is already a sealed memfd (namely if the user is using contrib/cmd/memfd-bind to create a persistent sealed memfd for runc), neither runc-dmz nor /proc/self/exe cloning will be used because they are not necessary. [1]: torvalds/linux@bfedb58 Co-authored-by: lifubang <[email protected]> Signed-off-by: lifubang <[email protected]> [cyphar: address various review nits] [cyphar: fix runc-dmz cross-compilation] [cyphar: embed runc-dmz into runc binary and clone in Go code] [cyphar: make runc-dmz optional, with fallback to /proc/self/exe cloning] [cyphar: do not use runc-dmz when the container has certain privs] Co-authored-by: Aleksa Sarai <[email protected]> Signed-off-by: Aleksa Sarai <[email protected]>
1 parent e089db3 commit dac4171

20 files changed

+608
-25
lines changed

.github/workflows/test.yml

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,13 +28,18 @@ jobs:
2828
rootless: ["rootless", ""]
2929
race: ["-race", ""]
3030
criu: ["", "criu-dev"]
31+
dmz: ["", "runc_nodmz"]
3132
exclude:
3233
- criu: criu-dev
3334
rootless: rootless
3435
- criu: criu-dev
3536
go-version: 1.20.x
3637
- criu: criu-dev
3738
race: -race
39+
- dmz: runc_nodmz
40+
criu: criu-dev
41+
- dmz: runc_nodmz
42+
os: ubuntu-20.04
3843
runs-on: ${{ matrix.os }}
3944

4045
steps:
@@ -71,6 +76,8 @@ jobs:
7176
go-version: ${{ matrix.go-version }}
7277

7378
- name: build
79+
env:
80+
EXTRA_BUILDTAGS: ${{ matrix.dmz }}
7481
run: sudo -E PATH="$PATH" make EXTRA_FLAGS="${{ matrix.race }}" all
7582

7683
- name: install bats
@@ -80,6 +87,8 @@ jobs:
8087

8188
- name: unit test
8289
if: matrix.rootless != 'rootless'
90+
env:
91+
EXTRA_BUILDTAGS: ${{ matrix.dmz }}
8392
run: sudo -E PATH="$PATH" -- make TESTFLAGS="${{ matrix.race }}" localunittest
8493

8594
- name: add rootless user
@@ -113,8 +122,12 @@ jobs:
113122
# However, we do not have 32-bit ARM CI, so we use i386 for testing 32bit stuff.
114123
# We are not interested in providing official support for i386.
115124
cross-i386:
116-
runs-on: ubuntu-22.04
117125
timeout-minutes: 15
126+
strategy:
127+
fail-fast: false
128+
matrix:
129+
dmz: ["", "runc_nodmz"]
130+
runs-on: ubuntu-22.04
118131

119132
steps:
120133

@@ -136,4 +149,6 @@ jobs:
136149
go-version: 1.x # Latest stable
137150

138151
- name: unit test
152+
env:
153+
EXTRA_BUILDTAGS: ${{ matrix.dmz }}
139154
run: sudo -E PATH="$PATH" -- make GOARCH=386 localunittest

.golangci-extra.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
run:
88
build-tags:
99
- seccomp
10+
- runc_nodmz
1011

1112
linters:
1213
disable-all: true

.golangci.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
run:
44
build-tags:
55
- seccomp
6+
- runc_nodmz
67

78
linters:
89
enable:

Makefile

Lines changed: 30 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,11 @@
1+
SHELL = /bin/bash
2+
13
CONTAINER_ENGINE := docker
24
GO ?= go
35

6+
# Get CC values for cross-compilation.
7+
include cc_platform.mk
8+
49
PREFIX ?= /usr/local
510
BINDIR := $(PREFIX)/sbin
611
MANDIR := $(PREFIX)/share/man
@@ -10,6 +15,7 @@ GIT_BRANCH_CLEAN := $(shell echo $(GIT_BRANCH) | sed -e "s/[^[:alnum:]]/-/g")
1015
RUNC_IMAGE := runc_dev$(if $(GIT_BRANCH_CLEAN),:$(GIT_BRANCH_CLEAN))
1116
PROJECT := github.com/opencontainers/runc
1217
BUILDTAGS ?= seccomp urfave_cli_no_docs
18+
BUILDTAGS += $(EXTRA_BUILDTAGS)
1319

1420
COMMIT ?= $(shell git describe --dirty --long --always)
1521
VERSION := $(shell cat ./VERSION)
@@ -57,16 +63,23 @@ endif
5763

5864
.DEFAULT: runc
5965

60-
runc:
66+
runc: runc-dmz
6167
$(GO_BUILD) -o runc .
68+
make verify-dmz-arch
6269

6370
all: runc recvtty sd-helper seccompagent fs-idmap
6471

6572
recvtty sd-helper seccompagent fs-idmap:
6673
$(GO_BUILD) -o contrib/cmd/$@/$@ ./contrib/cmd/$@
6774

68-
static:
75+
static: runc-dmz
6976
$(GO_BUILD_STATIC) -o runc .
77+
make verify-dmz-arch
78+
79+
.PHONY: runc-dmz
80+
runc-dmz:
81+
rm -f libcontainer/dmz/runc-dmz
82+
$(GO) generate -tags "$(BUILDTAGS)" ./libcontainer/dmz
7083

7184
releaseall: RELEASE_ARGS := "-a 386 -a amd64 -a arm64 -a armel -a armhf -a ppc64le -a riscv64 -a s390x"
7285
releaseall: release
@@ -147,12 +160,12 @@ install-man: man
147160
install -D -m 644 man/man8/*.8 $(DESTDIR)$(MANDIR)/man8
148161

149162
clean:
150-
rm -f runc runc-*
163+
rm -f runc runc-* libcontainer/dmz/runc-dmz
151164
rm -f contrib/cmd/recvtty/recvtty
152165
rm -f contrib/cmd/sd-helper/sd-helper
153166
rm -f contrib/cmd/seccompagent/seccompagent
154167
rm -f contrib/cmd/fs-idmap/fs-idmap
155-
rm -rf release
168+
sudo rm -rf release
156169
rm -rf man/man8
157170

158171
cfmt: C_SRC=$(shell git ls-files '*.c' | grep -v '^vendor/')
@@ -188,6 +201,18 @@ verify-dependencies: vendor
188201
@test -z "$$(git status --porcelain -- go.mod go.sum vendor/)" \
189202
|| (echo -e "git status:\n $$(git status -- go.mod go.sum vendor/)\nerror: vendor/, go.mod and/or go.sum not up to date. Run \"make vendor\" to update"; exit 1) \
190203
&& echo "all vendor files are up to date."
204+
verify-dmz-arch:
205+
@test -s libcontainer/dmz/runc-dmz || exit 0; \
206+
set -Eeuo pipefail; \
207+
export LC_ALL=C; \
208+
echo "readelf -h runc"; \
209+
readelf -h runc | grep -E "(Machine|Flags):"; \
210+
echo "readelf -h libcontainer/dmz/runc-dmz"; \
211+
readelf -h libcontainer/dmz/runc-dmz | grep -E "(Machine|Flags):"; \
212+
diff -u \
213+
<(readelf -h runc | grep -E "(Machine|Flags):") \
214+
<(readelf -h libcontainer/dmz/runc-dmz | grep -E "(Machine|Flags):") \
215+
&& echo "runc-dmz architecture matches runc binary."
191216

192217
validate-keyring:
193218
script/keyring_validate.sh
@@ -197,4 +222,4 @@ validate-keyring:
197222
test localtest unittest localunittest integration localintegration \
198223
rootlessintegration localrootlessintegration shell install install-bash \
199224
install-man clean cfmt shfmt localshfmt shellcheck \
200-
vendor verify-changelog verify-dependencies validate-keyring
225+
vendor verify-changelog verify-dependencies verify-dmz-arch validate-keyring

README.md

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -65,9 +65,10 @@ e.g. to disable seccomp:
6565
make BUILDTAGS=""
6666
```
6767

68-
| Build Tag | Feature | Enabled by default | Dependency |
69-
|-----------|------------------------------------|--------------------|------------|
70-
| seccomp | Syscall filtering | yes | libseccomp |
68+
| Build Tag | Feature | Enabled by Default | Dependencies |
69+
|---------------|---------------------------------------|--------------------|---------------------|
70+
| `seccomp` | Syscall filtering using `libseccomp`. | yes | `libseccomp` |
71+
| `!runc_nodmz` | Reduce memory usage for CVE-2019-5736 protection by using a small C binary. `runc_nodmz` disables this feature and causes runc to use a different protection mechanism which will further increases memory usage temporarily during container startup. This feature can also be disabled at runtime by setting the `RUNC_DMZ=legacy` environment variable. | yes ||
7172

7273
The following build tags were used earlier, but are now obsoleted:
7374
- **nokmem** (since runc v1.0.0-rc94 kernel memory settings are ignored)

cc_platform.mk

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
# NOTE: Make sure you keep this file in sync with scripts/lib.sh.
2+
3+
GO ?= go
4+
GOARCH ?= $(shell $(GO) env GOARCH)
5+
6+
ifneq ($(shell grep -i "ID_LIKE=.*suse" /etc/os-release),)
7+
# openSUSE has a custom PLATFORM
8+
PLATFORM ?= suse-linux
9+
IS_SUSE := 1
10+
else
11+
PLATFORM ?= linux-gnu
12+
endif
13+
14+
ifeq ($(GOARCH),$(shell GOARCH= $(GO) env GOARCH))
15+
# use the native CC and STRIP
16+
HOST :=
17+
else ifeq ($(GOARCH),386)
18+
# Always use the 64-bit compiler to build the 386 binary, which works for
19+
# the more common cross-build method for x86 (namely, the equivalent of
20+
# dpkg --add-architecture).
21+
ifdef IS_SUSE
22+
# There is no x86_64-suse-linux-gcc, so use the native one.
23+
HOST :=
24+
CPU_TYPE := i586
25+
else
26+
HOST := x86_64-$(PLATFORM)-
27+
CPU_TYPE := i686
28+
endif
29+
CFLAGS := -m32 -march=$(CPU_TYPE) $(CFLAGS)
30+
else ifeq ($(GOARCH),amd64)
31+
ifdef IS_SUSE
32+
# There is no x86_64-suse-linux-gcc, so use the native one.
33+
HOST :=
34+
else
35+
HOST := x86_64-$(PLATFORM)-
36+
endif
37+
else ifeq ($(GOARCH),arm64)
38+
HOST := aarch64-$(PLATFORM)-
39+
else ifeq ($(GOARCH),arm)
40+
# HOST already configured by release_build.sh in this case.
41+
else ifeq ($(GOARCH),armel)
42+
HOST := arm-$(PLATFORM)eabi-
43+
else ifeq ($(GOARCH),armhf)
44+
HOST := arm-$(PLATFORM)eabihf-
45+
else ifeq ($(GOARCH),ppc64le)
46+
HOST := powerpc64le-$(PLATFORM)-
47+
else ifeq ($(GOARCH),riscv64)
48+
HOST := riscv64-$(PLATFORM)-
49+
else ifeq ($(GOARCH),s390x)
50+
HOST := s390x-$(PLATFORM)-
51+
else
52+
$(error Unsupported GOARCH $(GOARCH))
53+
endif
54+
55+
ifeq ($(origin CC),$(filter $(origin CC),undefined default))
56+
# Override CC if it's undefined or just the default value set by Make.
57+
CC := $(HOST)gcc
58+
export CC
59+
endif
60+
STRIP ?= $(HOST)strip
61+
export STRIP

libcontainer/container_linux.go

Lines changed: 87 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ import (
2727
"github.com/opencontainers/runc/libcontainer/dmz"
2828
"github.com/opencontainers/runc/libcontainer/intelrdt"
2929
"github.com/opencontainers/runc/libcontainer/system"
30+
"github.com/opencontainers/runc/libcontainer/system/kernelversion"
3031
"github.com/opencontainers/runc/libcontainer/utils"
3132
)
3233

@@ -444,6 +445,48 @@ func (c *Container) includeExecFifo(cmd *exec.Cmd) error {
444445
return nil
445446
}
446447

448+
// No longer needed in Go 1.21.
449+
func slicesContains[S ~[]E, E comparable](slice S, needle E) bool {
450+
for _, val := range slice {
451+
if val == needle {
452+
return true
453+
}
454+
}
455+
return false
456+
}
457+
458+
func isDmzBinarySafe(c *configs.Config) bool {
459+
// Because we set the dumpable flag in nsexec, the only time when it is
460+
// unsafe to use runc-dmz is when the container process would be able to
461+
// race against "runc init" and bypass the ptrace_may_access() checks.
462+
//
463+
// This is only the case if the container processes could have
464+
// CAP_SYS_PTRACE somehow (i.e. the capability is present in the bounding,
465+
// inheritable, or ambient sets). Luckily, most containers do not have this
466+
// capability.
467+
if c.Capabilities == nil ||
468+
(!slicesContains(c.Capabilities.Bounding, "CAP_SYS_PTRACE") &&
469+
!slicesContains(c.Capabilities.Inheritable, "CAP_SYS_PTRACE") &&
470+
!slicesContains(c.Capabilities.Ambient, "CAP_SYS_PTRACE")) {
471+
return true
472+
}
473+
474+
// Since Linux 4.10 (see bfedb589252c0) user namespaced containers cannot
475+
// access /proc/$pid/exe of runc after it joins the namespace (until it
476+
// does an exec), regardless of the capability set. This has been
477+
// backported to other distribution kernels, but there's no way of checking
478+
// this cheaply -- better to be safe than sorry here.
479+
linux410 := kernelversion.KernelVersion{Kernel: 4, Major: 10}
480+
if ok, err := kernelversion.GreaterEqualThan(linux410); ok && err == nil {
481+
if c.Namespaces.Contains(configs.NEWUSER) {
482+
return true
483+
}
484+
}
485+
486+
// Assume it's unsafe otherwise.
487+
return false
488+
}
489+
447490
func (c *Container) newParentProcess(p *Process) (parentProcess, error) {
448491
parentInitPipe, childInitPipe, err := utils.NewSockPair("init")
449492
if err != nil {
@@ -457,27 +500,54 @@ func (c *Container) newParentProcess(p *Process) (parentProcess, error) {
457500
}
458501
logFilePair := filePair{parentLogPipe, childLogPipe}
459502

460-
// Make sure we use a new safe copy of /proc/self/exe each time this is
461-
// called, to make sure that if a container manages to overwrite the file
462-
// it cannot affect other containers on the system. For runc, this code
463-
// will only ever be called once, but libcontainer users might call this
464-
// more than once.
503+
// Make sure we use a new safe copy of /proc/self/exe or the runc-dmz
504+
// binary each time this is called, to make sure that if a container
505+
// manages to overwrite the file it cannot affect other containers on the
506+
// system. For runc, this code will only ever be called once, but
507+
// libcontainer users might call this more than once.
465508
p.closeClonedExes()
466509
var (
467510
exePath string
468-
safeExe *os.File
511+
// only one of dmzExe or safeExe are used at a time
512+
dmzExe, safeExe *os.File
469513
)
470514
if dmz.IsSelfExeCloned() {
471515
// /proc/self/exe is already a cloned binary -- no need to do anything
472516
logrus.Debug("skipping binary cloning -- /proc/self/exe is already cloned!")
473517
exePath = "/proc/self/exe"
474518
} else {
475-
safeExe, err = dmz.CloneSelfExe(c.root)
476-
if err != nil {
477-
return nil, fmt.Errorf("unable to create safe /proc/self/exe clone for runc init: %w", err)
519+
var err error
520+
if isDmzBinarySafe(c.config) {
521+
dmzExe, err = dmz.Binary(c.root)
522+
if err == nil {
523+
// We can use our own executable without cloning if we are using
524+
// runc-dmz.
525+
exePath = "/proc/self/exe"
526+
p.clonedExes = append(p.clonedExes, dmzExe)
527+
} else if errors.Is(err, dmz.ErrNoDmzBinary) {
528+
logrus.Debug("runc-dmz binary not embedded in runc binary, falling back to /proc/self/exe clone")
529+
} else if err != nil {
530+
return nil, fmt.Errorf("failed to create runc-dmz binary clone: %w", err)
531+
}
532+
} else {
533+
// If the configuration makes it unsafe to use runc-dmz, pretend we
534+
// don't have it embedded so we do /proc/self/exe cloning.
535+
logrus.Debug("container configuration unsafe for runc-dmz, falling back to /proc/self/exe clone")
536+
err = dmz.ErrNoDmzBinary
537+
}
538+
if errors.Is(err, dmz.ErrNoDmzBinary) {
539+
safeExe, err = dmz.CloneSelfExe(c.root)
540+
if err != nil {
541+
return nil, fmt.Errorf("unable to create safe /proc/self/exe clone for runc init: %w", err)
542+
}
543+
exePath = "/proc/self/fd/" + strconv.Itoa(int(safeExe.Fd()))
544+
p.clonedExes = append(p.clonedExes, safeExe)
545+
}
546+
// Just to make sure we don't run without protection.
547+
if dmzExe == nil && safeExe == nil {
548+
// This should never happen.
549+
return nil, fmt.Errorf("[internal error] attempted to spawn a container with no /proc/self/exe protection")
478550
}
479-
exePath = "/proc/self/fd/" + strconv.Itoa(int(safeExe.Fd()))
480-
p.clonedExes = append(p.clonedExes, safeExe)
481551
}
482552

483553
cmd := exec.Command(exePath, "init")
@@ -503,6 +573,12 @@ func (c *Container) newParentProcess(p *Process) (parentProcess, error) {
503573
"_LIBCONTAINER_STATEDIR="+c.root,
504574
)
505575

576+
if dmzExe != nil {
577+
cmd.ExtraFiles = append(cmd.ExtraFiles, dmzExe)
578+
cmd.Env = append(cmd.Env,
579+
"_LIBCONTAINER_DMZEXEFD="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1))
580+
}
581+
506582
cmd.ExtraFiles = append(cmd.ExtraFiles, childLogPipe)
507583
cmd.Env = append(cmd.Env,
508584
"_LIBCONTAINER_LOGPIPE="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1))

libcontainer/dmz/.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
/runc-dmz

libcontainer/dmz/Makefile

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
# Get CC values for cross-compilation.
2+
include ../../cc_platform.mk
3+
4+
runc-dmz: _dmz.c
5+
$(CC) $(CFLAGS) -static -o $@ $^
6+
$(STRIP) -gs $@

libcontainer/dmz/_dmz.c

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
#include <unistd.h>
2+
3+
extern char **environ;
4+
5+
int main(int argc, char **argv)
6+
{
7+
if (argc < 1)
8+
return 127;
9+
return execve(argv[0], argv, environ);
10+
}

0 commit comments

Comments
 (0)