Skip to content

Commit 0e9a335

Browse files
committed
nsexec: migrate memfd /proc/self/exe logic to Go code
This allows us to reduce the amount of C code in runc quite substantially, as well as removing a whole execve(2) from the nsexec path, because we no longer spawn "runc init" only to re-exec "runc init" after doing the clone. Signed-off-by: Aleksa Sarai <[email protected]>
1 parent 321aa20 commit 0e9a335

File tree

6 files changed

+286
-598
lines changed

6 files changed

+286
-598
lines changed

libcontainer/container_linux.go

Lines changed: 54 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ import (
2424

2525
"github.com/opencontainers/runc/libcontainer/cgroups"
2626
"github.com/opencontainers/runc/libcontainer/configs"
27+
"github.com/opencontainers/runc/libcontainer/dmz"
2728
"github.com/opencontainers/runc/libcontainer/intelrdt"
2829
"github.com/opencontainers/runc/libcontainer/system"
2930
"github.com/opencontainers/runc/libcontainer/utils"
@@ -316,6 +317,8 @@ func (c *Container) start(process *Process) (retErr error) {
316317
if err != nil {
317318
return fmt.Errorf("unable to create new parent process: %w", err)
318319
}
320+
// We do not need the cloned binaries once the process is spawned.
321+
defer process.closeClonedExes()
319322

320323
logsDone := parent.forwardChildLogs()
321324
if logsDone != nil {
@@ -454,24 +457,30 @@ func (c *Container) newParentProcess(p *Process) (parentProcess, error) {
454457
}
455458
logFilePair := filePair{parentLogPipe, childLogPipe}
456459

457-
cmd := c.commandTemplate(p, childInitPipe, childLogPipe)
458-
if !p.Init {
459-
return c.newSetnsProcess(p, cmd, messageSockPair, logFilePair)
460+
// Make sure we use a new safe copy of /proc/self/exe each time this is
461+
// called, to make sure that if a container manages to overwrite the file
462+
// it cannot affect other containers on the system. For runc, this code
463+
// will only ever be called once, but libcontainer users might call this
464+
// more than once.
465+
p.closeClonedExes()
466+
var (
467+
exePath string
468+
safeExe *os.File
469+
)
470+
if dmz.IsSelfExeCloned() {
471+
// /proc/self/exe is already a cloned binary -- no need to do anything
472+
logrus.Debug("skipping binary cloning -- /proc/self/exe is already cloned!")
473+
exePath = "/proc/self/exe"
474+
} else {
475+
safeExe, err = dmz.CloneSelfExe(c.root)
476+
if err != nil {
477+
return nil, fmt.Errorf("unable to create safe /proc/self/exe clone for runc init: %w", err)
478+
}
479+
exePath = "/proc/self/fd/" + strconv.Itoa(int(safeExe.Fd()))
480+
p.clonedExes = append(p.clonedExes, safeExe)
460481
}
461482

462-
// We only set up fifoFd if we're not doing a `runc exec`. The historic
463-
// reason for this is that previously we would pass a dirfd that allowed
464-
// for container rootfs escape (and not doing it in `runc exec` avoided
465-
// that problem), but we no longer do that. However, there's no need to do
466-
// this for `runc exec` so we just keep it this way to be safe.
467-
if err := c.includeExecFifo(cmd); err != nil {
468-
return nil, fmt.Errorf("unable to setup exec fifo: %w", err)
469-
}
470-
return c.newInitProcess(p, cmd, messageSockPair, logFilePair)
471-
}
472-
473-
func (c *Container) commandTemplate(p *Process, childInitPipe *os.File, childLogPipe *os.File) *exec.Cmd {
474-
cmd := exec.Command("/proc/self/exe", "init")
483+
cmd := exec.Command(exePath, "init")
475484
cmd.Args[0] = os.Args[0]
476485
cmd.Stdin = p.Stdin
477486
cmd.Stdout = p.Stdout
@@ -501,13 +510,38 @@ func (c *Container) commandTemplate(p *Process, childInitPipe *os.File, childLog
501510
cmd.Env = append(cmd.Env, "_LIBCONTAINER_LOGLEVEL="+p.LogLevel)
502511
}
503512

504-
// NOTE: when running a container with no PID namespace and the parent process spawning the container is
505-
// PID1 the pdeathsig is being delivered to the container's init process by the kernel for some reason
506-
// even with the parent still running.
513+
if safeExe != nil {
514+
// Due to a Go stdlib bug, we need to add safeExe to the set of
515+
// ExtraFiles otherwise it is possible for the stdlib to clobber the fd
516+
// during forkAndExecInChild1 and replace it with some other file that
517+
// might be malicious. This is less than ideal (because the descriptor
518+
// will be non-O_CLOEXEC) however we have protections in "runc init" to
519+
// stop us from leaking extra file descriptors.
520+
//
521+
// See <https://github.com/golang/go/issues/61751>.
522+
cmd.ExtraFiles = append(cmd.ExtraFiles, safeExe)
523+
}
524+
525+
// NOTE: when running a container with no PID namespace and the parent
526+
// process spawning the container is PID1 the pdeathsig is being
527+
// delivered to the container's init process by the kernel for some
528+
// reason even with the parent still running.
507529
if c.config.ParentDeathSignal > 0 {
508530
cmd.SysProcAttr.Pdeathsig = unix.Signal(c.config.ParentDeathSignal)
509531
}
510-
return cmd
532+
533+
if p.Init {
534+
// We only set up fifoFd if we're not doing a `runc exec`. The historic
535+
// reason for this is that previously we would pass a dirfd that allowed
536+
// for container rootfs escape (and not doing it in `runc exec` avoided
537+
// that problem), but we no longer do that. However, there's no need to do
538+
// this for `runc exec` so we just keep it this way to be safe.
539+
if err := c.includeExecFifo(cmd); err != nil {
540+
return nil, fmt.Errorf("unable to setup exec fifo: %w", err)
541+
}
542+
return c.newInitProcess(p, cmd, messageSockPair, logFilePair)
543+
}
544+
return c.newSetnsProcess(p, cmd, messageSockPair, logFilePair)
511545
}
512546

513547
// shouldSendMountSources says whether the child process must setup bind mounts with
Lines changed: 192 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,192 @@
1+
package dmz
2+
3+
import (
4+
"errors"
5+
"fmt"
6+
"io"
7+
"os"
8+
9+
"github.com/sirupsen/logrus"
10+
"golang.org/x/sys/unix"
11+
12+
"github.com/opencontainers/runc/libcontainer/system"
13+
)
14+
15+
// SealFunc is the signature of a sealing routine. It receives a pointer to the
// *os.File handle so that an implementation may swap the handle for a
// different one (sealFile replaces it with a re-opened descriptor).
type SealFunc func(**os.File) error
16+
17+
// Compile-time checks that both sealing helpers have the SealFunc signature.
var (
18+
_ SealFunc = sealMemfd
19+
_ SealFunc = sealFile
20+
)
21+
22+
// baseMemfdSeals is the set of seals that sealMemfd must apply successfully and
// that IsCloned requires to be present on a file.
const baseMemfdSeals = unix.F_SEAL_SEAL | unix.F_SEAL_SHRINK | unix.F_SEAL_GROW | unix.F_SEAL_WRITE
23+
24+
func sealMemfd(f **os.File) error {
25+
if err := (*f).Chmod(0o511); err != nil {
26+
return err
27+
}
28+
// Try to set the newer memfd sealing flags, but we ignore
29+
// errors because they are not needed and we want to continue
30+
// to work on older kernels.
31+
fd := (*f).Fd()
32+
// F_SEAL_FUTURE_WRITE -- Linux 5.1
33+
_, _ = unix.FcntlInt(fd, unix.F_ADD_SEALS, unix.F_SEAL_FUTURE_WRITE)
34+
// F_SEAL_EXEC -- Linux 6.3
35+
const F_SEAL_EXEC = 0x20 //nolint:revive // this matches the unix.* name
36+
_, _ = unix.FcntlInt(fd, unix.F_ADD_SEALS, F_SEAL_EXEC)
37+
// Apply all original memfd seals.
38+
_, err := unix.FcntlInt(fd, unix.F_ADD_SEALS, baseMemfdSeals)
39+
return os.NewSyscallError("fcntl(F_ADD_SEALS)", err)
40+
}
41+
42+
// Memfd creates a sealable executable memfd (supported since Linux 3.17).
43+
func Memfd(comment string) (*os.File, SealFunc, error) {
44+
file, err := system.ExecutableMemfd("runc_cloned:"+comment, unix.MFD_ALLOW_SEALING|unix.MFD_CLOEXEC)
45+
return file, sealMemfd, err
46+
}
47+
48+
func sealFile(f **os.File) error {
49+
if err := (*f).Chmod(0o511); err != nil {
50+
return err
51+
}
52+
// When sealing an O_TMPFILE-style descriptor we need to
53+
// re-open the path as O_PATH to clear the existing write
54+
// handle we have.
55+
opath, err := os.OpenFile(fmt.Sprintf("/proc/self/fd/%d", (*f).Fd()), unix.O_PATH|unix.O_CLOEXEC, 0)
56+
if err != nil {
57+
return fmt.Errorf("reopen tmpfile: %w", err)
58+
}
59+
_ = (*f).Close()
60+
*f = opath
61+
return nil
62+
}
63+
64+
// otmpfile creates an open(O_TMPFILE) file in the given directory (supported
65+
// since Linux 3.11).
66+
func otmpfile(dir string) (*os.File, SealFunc, error) {
67+
file, err := os.OpenFile(dir, unix.O_TMPFILE|unix.O_RDWR|unix.O_EXCL|unix.O_CLOEXEC, 0o700)
68+
if err != nil {
69+
return nil, nil, fmt.Errorf("O_TMPFILE creation failed: %w", err)
70+
}
71+
// Make sure we actually got an unlinked O_TMPFILE descriptor.
72+
var stat unix.Stat_t
73+
if err := unix.Fstat(int(file.Fd()), &stat); err != nil {
74+
file.Close()
75+
return nil, nil, fmt.Errorf("cannot fstat O_TMPFILE fd: %w", err)
76+
} else if stat.Nlink != 0 {
77+
file.Close()
78+
return nil, nil, errors.New("O_TMPFILE has non-zero nlink")
79+
}
80+
return file, sealFile, err
81+
}
82+
83+
// mktemp creates a classic unlinked file in the given directory.
84+
func mktemp(dir string) (*os.File, SealFunc, error) {
85+
file, err := os.CreateTemp(dir, "runc.")
86+
if err != nil {
87+
return nil, nil, err
88+
}
89+
// Unlink the file and verify it was unlinked.
90+
if err := os.Remove(file.Name()); err != nil {
91+
return nil, nil, fmt.Errorf("unlinking classic tmpfile: %w", err)
92+
}
93+
var stat unix.Stat_t
94+
if err := unix.Fstat(int(file.Fd()), &stat); err != nil {
95+
return nil, nil, fmt.Errorf("cannot fstat classic tmpfile: %w", err)
96+
} else if stat.Nlink != 0 {
97+
return nil, nil, fmt.Errorf("classic tmpfile %s has non-zero nlink after unlink", file.Name())
98+
}
99+
return file, sealFile, err
100+
}
101+
102+
func getSealableFile(comment, tmpDir string) (file *os.File, sealFn SealFunc, err error) {
103+
// First, try an executable memfd (supported since Linux 3.17).
104+
file, sealFn, err = Memfd(comment)
105+
if err == nil {
106+
return
107+
}
108+
logrus.Debugf("memfd cloned binary failed, falling back to O_TMPFILE: %v", err)
109+
// Try to fallback to O_TMPFILE (supported since Linux 3.11).
110+
file, sealFn, err = otmpfile(tmpDir)
111+
if err == nil {
112+
return
113+
}
114+
logrus.Debugf("O_TMPFILE cloned binary failed, falling back to mktemp(): %v", err)
115+
// Finally, try a classic unlinked temporary file.
116+
file, sealFn, err = mktemp(tmpDir)
117+
if err == nil {
118+
return
119+
}
120+
return nil, nil, fmt.Errorf("could not create sealable file for cloned binary: %w", err)
121+
}
122+
123+
// CloneBinary creates a "sealed" clone of a given binary, which can be used to
124+
// thwart attempts by the container process to gain access to host binaries
125+
// through procfs magic-link shenanigans. For more details on why this is
126+
// necessary, see CVE-2019-5736.
127+
func CloneBinary(src io.Reader, size int64, name, tmpDir string) (*os.File, error) {
128+
logrus.Debugf("cloning %s binary (%d bytes)", name, size)
129+
file, sealFn, err := getSealableFile(name, tmpDir)
130+
if err != nil {
131+
return nil, err
132+
}
133+
copied, err := io.Copy(file, src)
134+
if err != nil {
135+
file.Close()
136+
return nil, fmt.Errorf("copy binary: %w", err)
137+
} else if copied != size {
138+
file.Close()
139+
return nil, fmt.Errorf("copied binary size mismatch: %d != %d", copied, size)
140+
}
141+
if err := sealFn(&file); err != nil {
142+
file.Close()
143+
return nil, fmt.Errorf("could not seal fd: %w", err)
144+
}
145+
return file, nil
146+
}
147+
148+
// IsCloned returns whether the given file can be guaranteed to be a safe exe.
149+
func IsCloned(exe *os.File) bool {
150+
seals, err := unix.FcntlInt(exe.Fd(), unix.F_GET_SEALS, 0)
151+
if err != nil {
152+
// /proc/self/exe is probably not a memfd
153+
logrus.Debugf("F_GET_SEALS on %s failed: %v", exe.Name(), err)
154+
return false
155+
}
156+
// The memfd must have all of the base seals applied.
157+
logrus.Debugf("checking %s memfd seals: 0x%x", exe.Name(), seals)
158+
return seals&baseMemfdSeals == baseMemfdSeals
159+
}
160+
161+
// CloneSelfExe makes a clone of the current process's binary (through
162+
// /proc/self/exe). This binary can then be used for "runc init" in order to
163+
// make sure the container process can never resolve the original runc binary.
164+
// For more details on why this is necessary, see CVE-2019-5736.
165+
func CloneSelfExe(tmpDir string) (*os.File, error) {
166+
selfExe, err := os.Open("/proc/self/exe")
167+
if err != nil {
168+
return nil, fmt.Errorf("opening current binary: %w", err)
169+
}
170+
defer selfExe.Close()
171+
172+
stat, err := selfExe.Stat()
173+
if err != nil {
174+
return nil, fmt.Errorf("checking /proc/self/exe size: %w", err)
175+
}
176+
size := stat.Size()
177+
178+
return CloneBinary(selfExe, size, "/proc/self/exe", tmpDir)
179+
}
180+
181+
// IsSelfExeCloned returns whether /proc/self/exe is a cloned binary that can
182+
// be guaranteed to be safe. This means that it must be a sealed memfd. Other
183+
// types of clones cannot be completely verified as safe.
184+
func IsSelfExeCloned() bool {
185+
selfExe, err := os.Open("/proc/self/exe")
186+
if err != nil {
187+
logrus.Debugf("open /proc/self/exe failed: %v", err)
188+
return false
189+
}
190+
defer selfExe.Close()
191+
return IsCloned(selfExe)
192+
}

0 commit comments

Comments
 (0)