Skip to content

Commit f235fa6

Browse files
authored
Merge pull request #3987 from cyphar/cloned-binary-rework
nsexec: cloned binary rework
2 parents 1d9b158 + 90c8d36 commit f235fa6

36 files changed

+1431
-664
lines changed

.github/workflows/test.yml

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,13 +28,18 @@ jobs:
2828
rootless: ["rootless", ""]
2929
race: ["-race", ""]
3030
criu: ["", "criu-dev"]
31+
dmz: ["", "runc_nodmz"]
3132
exclude:
3233
- criu: criu-dev
3334
rootless: rootless
3435
- criu: criu-dev
3536
go-version: 1.20.x
3637
- criu: criu-dev
3738
race: -race
39+
- dmz: runc_nodmz
40+
criu: criu-dev
41+
- dmz: runc_nodmz
42+
os: ubuntu-20.04
3843
runs-on: ${{ matrix.os }}
3944

4045
steps:
@@ -71,6 +76,8 @@ jobs:
7176
go-version: ${{ matrix.go-version }}
7277

7378
- name: build
79+
env:
80+
EXTRA_BUILDTAGS: ${{ matrix.dmz }}
7481
run: sudo -E PATH="$PATH" make EXTRA_FLAGS="${{ matrix.race }}" all
7582

7683
- name: install bats
@@ -80,6 +87,8 @@ jobs:
8087

8188
- name: unit test
8289
if: matrix.rootless != 'rootless'
90+
env:
91+
EXTRA_BUILDTAGS: ${{ matrix.dmz }}
8392
run: sudo -E PATH="$PATH" -- make TESTFLAGS="${{ matrix.race }}" localunittest
8493

8594
- name: add rootless user
@@ -113,8 +122,12 @@ jobs:
113122
# However, we do not have 32-bit ARM CI, so we use i386 for testing 32bit stuff.
114123
# We are not interested in providing official support for i386.
115124
cross-i386:
116-
runs-on: ubuntu-22.04
117125
timeout-minutes: 15
126+
strategy:
127+
fail-fast: false
128+
matrix:
129+
dmz: ["", "runc_nodmz"]
130+
runs-on: ubuntu-22.04
118131

119132
steps:
120133

@@ -136,4 +149,6 @@ jobs:
136149
go-version: 1.x # Latest stable
137150

138151
- name: unit test
152+
env:
153+
EXTRA_BUILDTAGS: ${{ matrix.dmz }}
139154
run: sudo -E PATH="$PATH" -- make GOARCH=386 localunittest

.gitignore

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,11 @@
11
vendor/pkg
22
/runc
33
/runc-*
4-
contrib/cmd/recvtty/recvtty
5-
contrib/cmd/sd-helper/sd-helper
6-
contrib/cmd/seccompagent/seccompagent
7-
contrib/cmd/fs-idmap/fs-idmap
4+
/contrib/cmd/recvtty/recvtty
5+
/contrib/cmd/sd-helper/sd-helper
6+
/contrib/cmd/seccompagent/seccompagent
7+
/contrib/cmd/fs-idmap/fs-idmap
8+
/contrib/cmd/memfd-bind/memfd-bind
89
man/man8
910
release
1011
Vagrantfile

.golangci-extra.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
run:
88
build-tags:
99
- seccomp
10+
- runc_nodmz
1011

1112
linters:
1213
disable-all: true

.golangci.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
run:
44
build-tags:
55
- seccomp
6+
- runc_nodmz
67

78
linters:
89
enable:

Dockerfile

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -9,19 +9,15 @@ ARG CRIU_REPO=https://download.opensuse.org/repositories/devel:/tools:/criu/Debi
99
RUN KEYFILE=/usr/share/keyrings/criu-repo-keyring.gpg; \
1010
wget -nv $CRIU_REPO/Release.key -O- | gpg --dearmor > "$KEYFILE" \
1111
&& echo "deb [signed-by=$KEYFILE] $CRIU_REPO/ /" > /etc/apt/sources.list.d/criu.list \
12+
&& dpkg --add-architecture i386 \
1213
&& apt-get update \
1314
&& apt-get install -y --no-install-recommends \
1415
build-essential \
1516
criu \
16-
gcc-aarch64-linux-gnu libc-dev-arm64-cross \
17-
gcc-arm-linux-gnueabi libc-dev-armel-cross \
18-
gcc-arm-linux-gnueabihf libc-dev-armhf-cross \
19-
gcc-powerpc64le-linux-gnu libc-dev-ppc64el-cross \
20-
gcc-s390x-linux-gnu libc-dev-s390x-cross \
21-
gcc-riscv64-linux-gnu libc-dev-riscv64-cross \
17+
gcc \
18+
gcc-multilib \
2219
curl \
2320
gawk \
24-
gcc \
2521
gperf \
2622
iptables \
2723
jq \
@@ -32,6 +28,14 @@ RUN KEYFILE=/usr/share/keyrings/criu-repo-keyring.gpg; \
3228
sudo \
3329
uidmap \
3430
iproute2 \
31+
&& apt-get install -y --no-install-recommends \
32+
libc-dev:i386 libgcc-s1:i386 \
33+
gcc-aarch64-linux-gnu libc-dev-arm64-cross \
34+
gcc-arm-linux-gnueabi libc-dev-armel-cross \
35+
gcc-arm-linux-gnueabihf libc-dev-armhf-cross \
36+
gcc-powerpc64le-linux-gnu libc-dev-ppc64el-cross \
37+
gcc-s390x-linux-gnu libc-dev-s390x-cross \
38+
gcc-riscv64-linux-gnu libc-dev-riscv64-cross \
3539
&& apt-get clean \
3640
&& rm -rf /var/cache/apt /var/lib/apt/lists/* /etc/apt/sources.list.d/*.list
3741

@@ -54,7 +58,7 @@ RUN cd /tmp \
5458
ARG LIBSECCOMP_VERSION
5559
COPY script/seccomp.sh script/lib.sh /tmp/script/
5660
RUN mkdir -p /opt/libseccomp \
57-
&& /tmp/script/seccomp.sh "$LIBSECCOMP_VERSION" /opt/libseccomp arm64 armel armhf ppc64le riscv64 s390x
61+
&& /tmp/script/seccomp.sh "$LIBSECCOMP_VERSION" /opt/libseccomp 386 amd64 arm64 armel armhf ppc64le riscv64 s390x
5862
ENV LIBSECCOMP_VERSION=$LIBSECCOMP_VERSION
5963
ENV LD_LIBRARY_PATH=/opt/libseccomp/lib
6064
ENV PKG_CONFIG_PATH=/opt/libseccomp/lib/pkgconfig

Makefile

Lines changed: 35 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,11 @@
1+
SHELL = /bin/bash
2+
13
CONTAINER_ENGINE := docker
24
GO ?= go
35

6+
# Get CC values for cross-compilation.
7+
include cc_platform.mk
8+
49
PREFIX ?= /usr/local
510
BINDIR := $(PREFIX)/sbin
611
MANDIR := $(PREFIX)/share/man
@@ -10,6 +15,7 @@ GIT_BRANCH_CLEAN := $(shell echo $(GIT_BRANCH) | sed -e "s/[^[:alnum:]]/-/g")
1015
RUNC_IMAGE := runc_dev$(if $(GIT_BRANCH_CLEAN),:$(GIT_BRANCH_CLEAN))
1116
PROJECT := github.com/opencontainers/runc
1217
BUILDTAGS ?= seccomp urfave_cli_no_docs
18+
BUILDTAGS += $(EXTRA_BUILDTAGS)
1319

1420
COMMIT ?= $(shell git describe --dirty --long --always)
1521
VERSION := $(shell cat ./VERSION)
@@ -57,18 +63,25 @@ endif
5763

5864
.DEFAULT: runc
5965

60-
runc:
66+
runc: runc-dmz
6167
$(GO_BUILD) -o runc .
68+
make verify-dmz-arch
6269

63-
all: runc recvtty sd-helper seccompagent fs-idmap
70+
all: runc recvtty sd-helper seccompagent fs-idmap memfd-bind
6471

65-
recvtty sd-helper seccompagent fs-idmap:
72+
recvtty sd-helper seccompagent fs-idmap memfd-bind:
6673
$(GO_BUILD) -o contrib/cmd/$@/$@ ./contrib/cmd/$@
6774

68-
static:
75+
static: runc-dmz
6976
$(GO_BUILD_STATIC) -o runc .
77+
make verify-dmz-arch
78+
79+
.PHONY: runc-dmz
80+
runc-dmz:
81+
rm -f libcontainer/dmz/runc-dmz
82+
$(GO) generate -tags "$(BUILDTAGS)" ./libcontainer/dmz
7083

71-
releaseall: RELEASE_ARGS := "-a arm64 -a armel -a armhf -a ppc64le -a riscv64 -a s390x"
84+
releaseall: RELEASE_ARGS := "-a 386 -a amd64 -a arm64 -a armel -a armhf -a ppc64le -a riscv64 -a s390x"
7285
releaseall: release
7386

7487
release: runcimage
@@ -147,12 +160,13 @@ install-man: man
147160
install -D -m 644 man/man8/*.8 $(DESTDIR)$(MANDIR)/man8
148161

149162
clean:
150-
rm -f runc runc-*
163+
rm -f runc runc-* libcontainer/dmz/runc-dmz
164+
rm -f contrib/cmd/fs-idmap/fs-idmap
151165
rm -f contrib/cmd/recvtty/recvtty
152166
rm -f contrib/cmd/sd-helper/sd-helper
153167
rm -f contrib/cmd/seccompagent/seccompagent
154-
rm -f contrib/cmd/fs-idmap/fs-idmap
155-
rm -rf release
168+
rm -f contrib/cmd/memfd-bind/memfd-bind
169+
sudo rm -rf release
156170
rm -rf man/man8
157171

158172
cfmt: C_SRC=$(shell git ls-files '*.c' | grep -v '^vendor/')
@@ -188,6 +202,18 @@ verify-dependencies: vendor
188202
@test -z "$$(git status --porcelain -- go.mod go.sum vendor/)" \
189203
|| (echo -e "git status:\n $$(git status -- go.mod go.sum vendor/)\nerror: vendor/, go.mod and/or go.sum not up to date. Run \"make vendor\" to update"; exit 1) \
190204
&& echo "all vendor files are up to date."
205+
verify-dmz-arch:
206+
@test -s libcontainer/dmz/runc-dmz || exit 0; \
207+
set -Eeuo pipefail; \
208+
export LC_ALL=C; \
209+
echo "readelf -h runc"; \
210+
readelf -h runc | grep -E "(Machine|Flags):"; \
211+
echo "readelf -h libcontainer/dmz/runc-dmz"; \
212+
readelf -h libcontainer/dmz/runc-dmz | grep -E "(Machine|Flags):"; \
213+
diff -u \
214+
<(readelf -h runc | grep -E "(Machine|Flags):") \
215+
<(readelf -h libcontainer/dmz/runc-dmz | grep -E "(Machine|Flags):") \
216+
&& echo "runc-dmz architecture matches runc binary."
191217

192218
validate-keyring:
193219
script/keyring_validate.sh
@@ -197,4 +223,4 @@ validate-keyring:
197223
test localtest unittest localunittest integration localintegration \
198224
rootlessintegration localrootlessintegration shell install install-bash \
199225
install-man clean cfmt shfmt localshfmt shellcheck \
200-
vendor verify-changelog verify-dependencies validate-keyring
226+
vendor verify-changelog verify-dependencies verify-dmz-arch validate-keyring

README.md

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -65,15 +65,18 @@ e.g. to disable seccomp:
6565
make BUILDTAGS=""
6666
```
6767

68-
| Build Tag | Feature | Enabled by default | Dependency |
69-
|-----------|------------------------------------|--------------------|------------|
70-
| seccomp | Syscall filtering | yes | libseccomp |
68+
| Build Tag | Feature | Enabled by Default | Dependencies |
69+
|---------------|---------------------------------------|--------------------|---------------------|
70+
| `seccomp` | Syscall filtering using `libseccomp`. | yes | `libseccomp` |
71+
| `!runc_nodmz` | Reduce memory usage for CVE-2019-5736 protection by using a small C binary, [see `memfd-bind` for more details][contrib-memfd-bind]. `runc_nodmz` disables this feature and causes runc to use a different protection mechanism which will further increase memory usage temporarily during container startup. This feature can also be disabled at runtime by setting the `RUNC_DMZ=legacy` environment variable. | yes ||
7172

7273
The following build tags were used earlier, but are now obsoleted:
7374
- **nokmem** (since runc v1.0.0-rc94 kernel memory settings are ignored)
7475
- **apparmor** (since runc v1.0.0-rc93 the feature is always enabled)
7576
- **selinux** (since runc v1.0.0-rc93 the feature is always enabled)
7677

78+
[contrib-memfd-bind]: /contrib/cmd/memfd-bind/README.md
79+
7780
### Running the test suite
7881

7982
`runc` currently supports running its test suite via Docker.

cc_platform.mk

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
# NOTE: Make sure you keep this file in sync with script/lib.sh.
2+
3+
GO ?= go
4+
GOARCH ?= $(shell $(GO) env GOARCH)
5+
6+
ifneq ($(shell grep -i "ID_LIKE=.*suse" /etc/os-release),)
7+
# openSUSE has a custom PLATFORM
8+
PLATFORM ?= suse-linux
9+
IS_SUSE := 1
10+
else
11+
PLATFORM ?= linux-gnu
12+
endif
13+
14+
ifeq ($(GOARCH),$(shell GOARCH= $(GO) env GOARCH))
15+
# use the native CC and STRIP
16+
HOST :=
17+
else ifeq ($(GOARCH),386)
18+
# Always use the 64-bit compiler to build the 386 binary, which works for
19+
# the more common cross-build method for x86 (namely, the equivalent of
20+
# dpkg --add-architecture).
21+
ifdef IS_SUSE
22+
# There is no x86_64-suse-linux-gcc, so use the native one.
23+
HOST :=
24+
CPU_TYPE := i586
25+
else
26+
HOST := x86_64-$(PLATFORM)-
27+
CPU_TYPE := i686
28+
endif
29+
CFLAGS := -m32 -march=$(CPU_TYPE) $(CFLAGS)
30+
else ifeq ($(GOARCH),amd64)
31+
ifdef IS_SUSE
32+
# There is no x86_64-suse-linux-gcc, so use the native one.
33+
HOST :=
34+
else
35+
HOST := x86_64-$(PLATFORM)-
36+
endif
37+
else ifeq ($(GOARCH),arm64)
38+
HOST := aarch64-$(PLATFORM)-
39+
else ifeq ($(GOARCH),arm)
40+
# HOST already configured by release_build.sh in this case.
41+
else ifeq ($(GOARCH),armel)
42+
HOST := arm-$(PLATFORM)eabi-
43+
else ifeq ($(GOARCH),armhf)
44+
HOST := arm-$(PLATFORM)eabihf-
45+
else ifeq ($(GOARCH),ppc64le)
46+
HOST := powerpc64le-$(PLATFORM)-
47+
else ifeq ($(GOARCH),riscv64)
48+
HOST := riscv64-$(PLATFORM)-
49+
else ifeq ($(GOARCH),s390x)
50+
HOST := s390x-$(PLATFORM)-
51+
else
52+
$(error Unsupported GOARCH $(GOARCH))
53+
endif
54+
55+
ifeq ($(origin CC),$(filter $(origin CC),undefined default))
56+
# Override CC if it's undefined or just the default value set by Make.
57+
CC := $(HOST)gcc
58+
export CC
59+
endif
60+
STRIP ?= $(HOST)strip
61+
export STRIP

contrib/cmd/memfd-bind/README.md

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
## memfd-bind ##
2+
3+
`runc` normally has to make a binary copy of itself (or of a smaller helper
4+
binary called `runc-dmz`) when constructing a container process in order to
5+
defend against certain container runtime attacks such as CVE-2019-5736.
6+
7+
This cloned binary only exists until the container process starts (this means
8+
for `runc run` and `runc exec`, it only exists for a few hundred milliseconds
9+
-- for `runc create` it exists until `runc start` is called). However, because
10+
the clone is done using a memfd (or by creating files in directories that are
11+
likely to be a `tmpfs`), this can lead to temporary increases in *host* memory
12+
usage. Unless you are running on a cgroupv1 system with the cgroupv1 memory
13+
controller enabled and the (deprecated) `memory.move_charge_at_immigrate`
14+
enabled, there is no effect on the container's memory.
15+
16+
However, for certain configurations this can still be undesirable. This daemon
17+
allows you to create a sealed memfd copy of the `runc` binary, which will cause
18+
`runc` to skip all binary copying, resulting in no additional memory usage for
19+
each container process (instead there is a single in-memory copy of the
20+
binary). It should be noted that (strictly speaking) this is slightly less
21+
secure if you are concerned about Dirty Cow-like 0-day kernel vulnerabilities,
22+
but for most users the security benefit is identical.
23+
24+
The provided `memfd-bind@.service` file can be used to get systemd to manage
25+
this daemon. You can supply the path like so:
26+
27+
```
28+
% systemctl start memfd-bind@/usr/bin/runc
29+
```
30+
31+
Thus, there are three ways of protecting against CVE-2019-5736, in order of how
32+
much memory they can use:
33+
34+
* `memfd-bind` only creates a single in-memory copy of the `runc` binary (about
35+
10MB), regardless of how many containers are running.
36+
37+
* `runc-dmz` is (depending on which libc it was compiled with) between 10kB and
38+
1MB in size, and a copy is created once per process spawned inside a
39+
container by runc (both the pid1 and every `runc exec`). There are
40+
circumstances where using `runc-dmz` will fail in ways that runc cannot
41+
predict ahead of time (such as restrictive LSMs applied to containers), in
42+
which case users can disable it with the `RUNC_DMZ=legacy` setting.
43+
`runc-dmz` also requires an additional `execve` over the other options,
44+
though since the binary is so small the cost is probably not even noticeable.
45+
46+
* The classic method of making a copy of the entire `runc` binary during
47+
container process setup takes up about 10MB per process spawned inside the
48+
container by runc (both pid1 and `runc exec`).
49+
50+
### Caveats ###
51+
52+
There are several downsides with using `memfd-bind` on the `runc` binary:
53+
54+
* The `memfd-bind` process needs to continue to run indefinitely in order for
55+
the memfd reference to stay alive. If the process is forcefully killed, the
56+
bind-mount on top of the `runc` binary will become stale and nobody will be
57+
able to execute it (you can use `memfd-bind --cleanup` to clean up the stale
58+
mount).
59+
60+
* Only root can execute the cloned binary due to permission restrictions on
61+
accessing other processes' files. More specifically, only users with ptrace
62+
privileges over the memfd-bind daemon can access the file (but in practice
63+
this is usually only root).
64+
65+
* When updating `runc`, the daemon needs to be stopped before the update (so
66+
the package manager can access the underlying file) and then restarted after
67+
the update.

0 commit comments

Comments
 (0)