opencontainers
diff --git a/‎.github/workflows/test.yml‎
Lines changed: 16 additions & 1 deletion b/‎.github/workflows/test.yml‎
Lines changed: 16 additions & 1 deletion
diff --git a/‎.gitignore‎
Lines changed: 5 additions & 4 deletions b/‎.gitignore‎
Lines changed: 5 additions & 4 deletions
diff --git a/‎.golangci-extra.yml‎
Lines changed: 1 addition & 0 deletions b/‎.golangci-extra.yml‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎.golangci.yml‎
Lines changed: 1 addition & 0 deletions b/‎.golangci.yml‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎Dockerfile‎
Lines changed: 12 additions & 8 deletions b/‎Dockerfile‎
Lines changed: 12 additions & 8 deletions
diff --git a/‎Makefile‎
Lines changed: 35 additions & 9 deletions b/‎Makefile‎
Lines changed: 35 additions & 9 deletions
diff --git a/‎README.md‎
Lines changed: 6 additions & 3 deletions b/‎README.md‎
Lines changed: 6 additions & 3 deletions
diff --git a/‎cc_platform.mk‎
Lines changed: 61 additions & 0 deletions b/‎cc_platform.mk‎
Lines changed: 61 additions & 0 deletions
diff --git a/‎contrib/cmd/memfd-bind/README.md‎
Lines changed: 67 additions & 0 deletions b/‎contrib/cmd/memfd-bind/README.md‎
Lines changed: 67 additions & 0 deletions
@@ -28,13 +28,18 @@ jobs:
         rootless: ["rootless", ""]
         race: ["-race", ""]
         criu: ["", "criu-dev"]
+        dmz: ["", "runc_nodmz"]
         exclude:
           - criu: criu-dev
             rootless: rootless
           - criu: criu-dev
             go-version: 1.20.x
           - criu: criu-dev
             race: -race
+          - dmz: runc_nodmz
+            criu: criu-dev
+          - dmz: runc_nodmz
+            os: ubuntu-20.04
     runs-on: ${{ matrix.os }}
 
     steps:
@@ -71,6 +76,8 @@ jobs:
         go-version: ${{ matrix.go-version }}
 
     - name: build
+      env:
+        EXTRA_BUILDTAGS: ${{ matrix.dmz }}
       run: sudo -E PATH="$PATH" make EXTRA_FLAGS="${{ matrix.race }}" all
 
     - name: install bats
@@ -80,6 +87,8 @@ jobs:
 
     - name: unit test
       if: matrix.rootless != 'rootless'
+      env:
+        EXTRA_BUILDTAGS: ${{ matrix.dmz }}
       run: sudo -E PATH="$PATH" -- make TESTFLAGS="${{ matrix.race }}" localunittest
 
     - name: add rootless user
@@ -113,8 +122,12 @@ jobs:
   # However, we do not have 32-bit ARM CI, so we use i386 for testing 32bit stuff.
   # We are not interested in providing official support for i386.
   cross-i386:
-    runs-on: ubuntu-22.04
     timeout-minutes: 15
+    strategy:
+      fail-fast: false
+      matrix:
+        dmz: ["", "runc_nodmz"]
+    runs-on: ubuntu-22.04
 
     steps:
 
@@ -136,4 +149,6 @@ jobs:
         go-version: 1.x # Latest stable
 
     - name: unit test
+      env:
+        EXTRA_BUILDTAGS: ${{ matrix.dmz }}
       run: sudo -E PATH="$PATH" -- make GOARCH=386 localunittest
@@ -1,10 +1,11 @@
 vendor/pkg
 /runc
 /runc-*
-contrib/cmd/recvtty/recvtty
-contrib/cmd/sd-helper/sd-helper
-contrib/cmd/seccompagent/seccompagent
-contrib/cmd/fs-idmap/fs-idmap
+/contrib/cmd/recvtty/recvtty
+/contrib/cmd/sd-helper/sd-helper
+/contrib/cmd/seccompagent/seccompagent
+/contrib/cmd/fs-idmap/fs-idmap
+/contrib/cmd/memfd-bind/memfd-bind
 man/man8
 release
 Vagrantfile
 
@@ -7,6 +7,7 @@
 run:
   build-tags:
     - seccomp
+    - runc_nodmz
 
 linters:
   disable-all: true
 
@@ -3,6 +3,7 @@
 run:
   build-tags:
     - seccomp
+    - runc_nodmz
 
 linters:
   enable:
 
@@ -9,19 +9,15 @@ ARG CRIU_REPO=https://download.opensuse.org/repositories/devel:/tools:/criu/Debi
 RUN KEYFILE=/usr/share/keyrings/criu-repo-keyring.gpg; \
     wget -nv $CRIU_REPO/Release.key -O- | gpg --dearmor > "$KEYFILE" \
     && echo "deb [signed-by=$KEYFILE] $CRIU_REPO/ /" > /etc/apt/sources.list.d/criu.list \
+    && dpkg --add-architecture i386 \
     && apt-get update \
     && apt-get install -y --no-install-recommends \
         build-essential \
         criu \
-        gcc-aarch64-linux-gnu libc-dev-arm64-cross \
-        gcc-arm-linux-gnueabi libc-dev-armel-cross \
-        gcc-arm-linux-gnueabihf libc-dev-armhf-cross \
-        gcc-powerpc64le-linux-gnu libc-dev-ppc64el-cross \
-        gcc-s390x-linux-gnu libc-dev-s390x-cross \
-        gcc-riscv64-linux-gnu libc-dev-riscv64-cross \
+        gcc \
+        gcc-multilib \
         curl \
         gawk \
-        gcc \
         gperf \
         iptables \
         jq \
@@ -32,6 +28,14 @@ RUN KEYFILE=/usr/share/keyrings/criu-repo-keyring.gpg; \
         sudo \
         uidmap \
         iproute2 \
+    && apt-get install -y --no-install-recommends \
+        libc-dev:i386 libgcc-s1:i386 \
+        gcc-aarch64-linux-gnu libc-dev-arm64-cross \
+        gcc-arm-linux-gnueabi libc-dev-armel-cross \
+        gcc-arm-linux-gnueabihf libc-dev-armhf-cross \
+        gcc-powerpc64le-linux-gnu libc-dev-ppc64el-cross \
+        gcc-s390x-linux-gnu libc-dev-s390x-cross \
+        gcc-riscv64-linux-gnu libc-dev-riscv64-cross \
     && apt-get clean \
     && rm -rf /var/cache/apt /var/lib/apt/lists/* /etc/apt/sources.list.d/*.list
 
@@ -54,7 +58,7 @@ RUN cd /tmp \
 ARG LIBSECCOMP_VERSION
 COPY script/seccomp.sh script/lib.sh /tmp/script/
 RUN mkdir -p /opt/libseccomp \
-    && /tmp/script/seccomp.sh "$LIBSECCOMP_VERSION" /opt/libseccomp arm64 armel armhf ppc64le riscv64 s390x
+    && /tmp/script/seccomp.sh "$LIBSECCOMP_VERSION" /opt/libseccomp 386 amd64 arm64 armel armhf ppc64le riscv64 s390x
 ENV LIBSECCOMP_VERSION=$LIBSECCOMP_VERSION
 ENV LD_LIBRARY_PATH=/opt/libseccomp/lib
 ENV PKG_CONFIG_PATH=/opt/libseccomp/lib/pkgconfig
 
@@ -1,6 +1,11 @@
+SHELL = /bin/bash
+
 CONTAINER_ENGINE := docker
 GO ?= go
 
+# Get CC values for cross-compilation.
+include cc_platform.mk
+
 PREFIX ?= /usr/local
 BINDIR := $(PREFIX)/sbin
 MANDIR := $(PREFIX)/share/man
@@ -10,6 +15,7 @@ GIT_BRANCH_CLEAN := $(shell echo $(GIT_BRANCH) | sed -e "s/[^[:alnum:]]/-/g")
 RUNC_IMAGE := runc_dev$(if $(GIT_BRANCH_CLEAN),:$(GIT_BRANCH_CLEAN))
 PROJECT := github.com/opencontainers/runc
 BUILDTAGS ?= seccomp urfave_cli_no_docs
+BUILDTAGS += $(EXTRA_BUILDTAGS)
 
 COMMIT ?= $(shell git describe --dirty --long --always)
 VERSION := $(shell cat ./VERSION)
@@ -57,18 +63,25 @@ endif
 
 .DEFAULT: runc
 
-runc:
+# Build the runc binary once runc-dmz has been regenerated, then check that
+# both binaries target the same architecture. The recursive call goes through
+# $(MAKE) so the sub-make is the same make program as the parent and picks up
+# its flags (e.g. -n, -j) correctly.
+runc: runc-dmz
 	$(GO_BUILD) -o runc .
+	$(MAKE) verify-dmz-arch
 
-all: runc recvtty sd-helper seccompagent fs-idmap
+all: runc recvtty sd-helper seccompagent fs-idmap memfd-bind
 
-recvtty sd-helper seccompagent fs-idmap:
+# Each contrib tool is built from ./contrib/cmd/<name> into the same directory.
+recvtty sd-helper seccompagent fs-idmap memfd-bind:
 	$(GO_BUILD) -o contrib/cmd/$@/$@ ./contrib/cmd/$@
 
-static:
+# Same as the runc target, but using the static build command.
+static: runc-dmz
 	$(GO_BUILD_STATIC) -o runc .
+	$(MAKE) verify-dmz-arch
+
+# Drop any previous libcontainer/dmz/runc-dmz and regenerate it via
+# "go generate" using the active build tags.
+.PHONY: runc-dmz
+runc-dmz:
+	rm -f libcontainer/dmz/runc-dmz
+	$(GO) generate -tags "$(BUILDTAGS)" ./libcontainer/dmz
 
-releaseall: RELEASE_ARGS := "-a arm64 -a armel -a armhf -a ppc64le -a riscv64 -a s390x"
+releaseall: RELEASE_ARGS := "-a 386 -a amd64 -a arm64 -a armel -a armhf -a ppc64le -a riscv64 -a s390x"
 releaseall: release
 
 release: runcimage
@@ -147,12 +160,13 @@ install-man: man
 	install -D -m 644 man/man8/*.8 $(DESTDIR)$(MANDIR)/man8
 
+# Remove build outputs: the runc binaries, the generated
+# libcontainer/dmz/runc-dmz helper, the contrib tool binaries, the release
+# directory and the generated man pages.
+# NOTE(review): the release directory is removed with sudo -- presumably
+# because it can contain root-owned files; confirm before changing.
 clean:
-	rm -f runc runc-*
+	rm -f runc runc-* libcontainer/dmz/runc-dmz
+	rm -f contrib/cmd/fs-idmap/fs-idmap
 	rm -f contrib/cmd/recvtty/recvtty
 	rm -f contrib/cmd/sd-helper/sd-helper
 	rm -f contrib/cmd/seccompagent/seccompagent
-	rm -f contrib/cmd/fs-idmap/fs-idmap
-	rm -rf release
+	rm -f contrib/cmd/memfd-bind/memfd-bind
+	sudo rm -rf release
 	rm -rf man/man8
 
 cfmt: C_SRC=$(shell git ls-files '*.c' | grep -v '^vendor/')
@@ -188,6 +202,18 @@ verify-dependencies: vendor
 	@test -z "$$(git status --porcelain -- go.mod go.sum vendor/)" \
 		|| (echo -e "git status:\n $$(git status -- go.mod go.sum vendor/)\nerror: vendor/, go.mod and/or go.sum not up to date. Run \"make vendor\" to update"; exit 1) \
 		&& echo "all vendor files are up to date."
+# Check that ./runc and libcontainer/dmz/runc-dmz report the same "Machine"
+# and "Flags" fields in their ELF headers (as printed by "readelf -h").
+# If libcontainer/dmz/runc-dmz is missing or empty, the check is skipped and
+# the target succeeds. Both header excerpts are echoed first so they show up
+# in the build log, then compared with "diff -u".
+# The recipe uses bash-only features ("set -o pipefail", "<(...)"), so it
+# needs make to run recipes with bash.
+verify-dmz-arch:
+	@test -s libcontainer/dmz/runc-dmz || exit 0; \
+		set -Eeuo pipefail; \
+		export LC_ALL=C; \
+		echo "readelf -h runc"; \
+		readelf -h runc | grep -E "(Machine|Flags):"; \
+		echo "readelf -h libcontainer/dmz/runc-dmz"; \
+		readelf -h libcontainer/dmz/runc-dmz | grep -E "(Machine|Flags):"; \
+		diff -u \
+			<(readelf -h runc | grep -E "(Machine|Flags):") \
+			<(readelf -h libcontainer/dmz/runc-dmz | grep -E "(Machine|Flags):") \
+		&& echo "runc-dmz architecture matches runc binary."
 
 validate-keyring:
 	script/keyring_validate.sh
@@ -197,4 +223,4 @@ validate-keyring:
 	test localtest unittest localunittest integration localintegration \
 	rootlessintegration localrootlessintegration shell install install-bash \
 	install-man clean cfmt shfmt localshfmt shellcheck \
-	vendor verify-changelog verify-dependencies validate-keyring
+	vendor verify-changelog verify-dependencies verify-dmz-arch validate-keyring
@@ -65,15 +65,18 @@ e.g. to disable seccomp:
 make BUILDTAGS=""
 ```
 
-| Build Tag | Feature                            | Enabled by default | Dependency |
-|-----------|------------------------------------|--------------------|------------|
-| seccomp   | Syscall filtering                  | yes                | libseccomp |
+| Build Tag     | Feature                               | Enabled by Default | Dependencies        |
+|---------------|---------------------------------------|--------------------|---------------------|
+| `seccomp`     | Syscall filtering using `libseccomp`. | yes                | `libseccomp`        |
+| `!runc_nodmz` | Reduce memory usage for CVE-2019-5736 protection by using a small C binary, [see `memfd-bind` for more details][contrib-memfd-bind]. `runc_nodmz` disables this feature and causes runc to use a different protection mechanism which will further increase memory usage temporarily during container startup. This feature can also be disabled at runtime by setting the `RUNC_DMZ=legacy` environment variable. | yes ||
 
 The following build tags were used earlier, but are now obsoleted:
  - **nokmem** (since runc v1.0.0-rc94 kernel memory settings are ignored)
  - **apparmor** (since runc v1.0.0-rc93 the feature is always enabled)
  - **selinux**  (since runc v1.0.0-rc93 the feature is always enabled)
 
+ [contrib-memfd-bind]: /contrib/cmd/memfd-bind/README.md
+
 ### Running the test suite
 
 `runc` currently supports running its test suite via Docker.
 
@@ -0,0 +1,61 @@
+# NOTE: Make sure you keep this file in sync with script/lib.sh.
+
+# Use "go" from PATH and the GOARCH it reports unless the caller has already
+# set these variables.
+GO ?= go
+GOARCH ?= $(shell $(GO) env GOARCH)
+
+# PLATFORM is the vendor/OS part of the cross-toolchain prefix (used as e.g.
+# aarch64-$(PLATFORM)-). It is chosen by looking for "suse" in the ID_LIKE
+# line of /etc/os-release; IS_SUSE is only defined when that matches.
+ifneq ($(shell grep -i "ID_LIKE=.*suse" /etc/os-release),)
+	# openSUSE has a custom PLATFORM
+	PLATFORM ?= suse-linux
+	IS_SUSE := 1
+else
+	PLATFORM ?= linux-gnu
+endif
+
+# Select HOST, the toolchain prefix that is prepended to "gcc" and "strip".
+# It is left empty when the requested GOARCH equals what "go env GOARCH"
+# reports with GOARCH cleared from the environment (i.e. a native build).
+# Any GOARCH not listed in this chain aborts the build via $(error).
+ifeq ($(GOARCH),$(shell GOARCH= $(GO) env GOARCH))
+	# use the native CC and STRIP
+	HOST :=
+else ifeq ($(GOARCH),386)
+	# Always use the 64-bit compiler to build the 386 binary, which works for
+	# the more common cross-build method for x86 (namely, the equivalent of
+	# dpkg --add-architecture).
+	ifdef IS_SUSE
+		# There is no x86_64-suse-linux-gcc, so use the native one.
+		HOST :=
+		CPU_TYPE := i586
+	else
+		HOST := x86_64-$(PLATFORM)-
+		CPU_TYPE := i686
+	endif
+	# Prepend the 32-bit flags while keeping any CFLAGS the caller supplied.
+	CFLAGS := -m32 -march=$(CPU_TYPE) $(CFLAGS)
+else ifeq ($(GOARCH),amd64)
+	ifdef IS_SUSE
+		# There is no x86_64-suse-linux-gcc, so use the native one.
+		HOST :=
+	else
+		HOST := x86_64-$(PLATFORM)-
+	endif
+else ifeq ($(GOARCH),arm64)
+	HOST := aarch64-$(PLATFORM)-
+else ifeq ($(GOARCH),arm)
+	# HOST already configured by release_build.sh in this case.
+else ifeq ($(GOARCH),armel)
+	HOST := arm-$(PLATFORM)eabi-
+else ifeq ($(GOARCH),armhf)
+	HOST := arm-$(PLATFORM)eabihf-
+else ifeq ($(GOARCH),ppc64le)
+	HOST := powerpc64le-$(PLATFORM)-
+else ifeq ($(GOARCH),riscv64)
+	HOST := riscv64-$(PLATFORM)-
+else ifeq ($(GOARCH),s390x)
+	HOST := s390x-$(PLATFORM)-
+else
+$(error Unsupported GOARCH $(GOARCH))
+endif
+
+# Export the compiler and strip tool into the environment of the commands
+# make runs. A CC given by the caller (environment or command line) is kept;
+# only make's built-in default or an undefined CC is replaced.
+ifeq ($(origin CC),$(filter $(origin CC),undefined default))
+	# Override CC if it's undefined or just the default value set by Make.
+	CC := $(HOST)gcc
+	export CC
+endif
+# STRIP keeps any value already set; otherwise it uses the same HOST prefix.
+STRIP ?= $(HOST)strip
+export STRIP
@@ -0,0 +1,67 @@
+## memfd-bind ##
+
+`runc` normally has to make a binary copy of itself (or of a smaller helper
+binary called `runc-dmz`) when constructing a container process in order to
+defend against certain container runtime attacks such as CVE-2019-5736.
+
+This cloned binary only exists until the container process starts (this means
+for `runc run` and `runc exec`, it only exists for a few hundred milliseconds
+-- for `runc create` it exists until `runc start` is called). However, because
+the clone is done using a memfd (or by creating files in directories that are
+likely to be a `tmpfs`), this can lead to temporary increases in *host* memory
+usage. Unless you are running on a cgroupv1 system with the cgroupv1 memory
+controller enabled and the (deprecated) `memory.move_charge_at_immigrate`
+enabled, there is no effect on the container's memory.
+
+However, for certain configurations this can still be undesirable. This daemon
+allows you to create a sealed memfd copy of the `runc` binary, which will cause
+`runc` to skip all binary copying, resulting in no additional memory usage for
+each container process (instead there is a single in-memory copy of the
+binary). It should be noted that (strictly speaking) this is slightly less
+secure if you are concerned about Dirty Cow-like 0-day kernel vulnerabilities,
+but for most users the security benefit is identical.
+
+The provided `memfd-bind@.service` file can be used to get systemd to manage
+this daemon. You can supply the path like so:
+
+```
+% systemctl start memfd-bind@/usr/bin/runc
+```
+
+Thus, there are three ways of protecting against CVE-2019-5736, in order of how
+much memory they can use:
+
+* `memfd-bind` only creates a single in-memory copy of the `runc` binary (about
+  10MB), regardless of how many containers are running.
+
+* `runc-dmz` is (depending on which libc it was compiled with) between 10kB and
+  1MB in size, and a copy is created once per process spawned inside a
+  container by runc (both the pid1 and every `runc exec`). There are
+  circumstances where using `runc-dmz` will fail in ways that runc cannot
+  predict ahead of time (such as restrictive LSMs applied to containers), in
+  which case users can disable it with the `RUNC_DMZ=legacy` setting.
+  `runc-dmz` also requires an additional `execve` over the other options,
+  though since the binary is so small the cost is probably not even noticeable.
+
+* The classic method of making a copy of the entire `runc` binary during
+  container process setup takes up about 10MB per process spawned inside the
+  container by runc (both pid1 and `runc exec`).
+
+### Caveats ###
+
+There are several downsides with using `memfd-bind` on the `runc` binary:
+
+* The `memfd-bind` process needs to continue to run indefinitely in order for
+  the memfd reference to stay alive. If the process is forcefully killed, the
+  bind-mount on top of the `runc` binary will become stale and nobody will be
+  able to execute it (you can use `memfd-bind --cleanup` to clean up the stale
+  mount).
+
+* Only root can execute the cloned binary due to permission restrictions on
+  accessing other processes' files. More specifically, only users with ptrace
+  privileges over the memfd-bind daemon can access the file (but in practice
+  this is usually only root).
+
+* When updating `runc`, the daemon needs to be stopped before the update (so
+  the package manager can access the underlying file) and then restarted after
+  the update.