diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 56bae35f36f..8d35c69c339 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -78,6 +78,15 @@ jobs: with: bats-version: 1.9.0 + - name: procfs mount + run: | + # Get the list of mounts to help with debugging. + cat /proc/self/mounts + # Create a procfs mount that is not masked, to ensure that container + # procfs mounts will succeed. + sudo mkdir -p /tmp/.procfs-stashed-mount + sudo unshare -pf mount -t proc -o subset=pid proc /tmp/.procfs-stashed-mount + - name: unit test if: matrix.rootless != 'rootless' run: sudo -E PATH="$PATH" -- make TESTFLAGS="${{ matrix.race }}" localunittest diff --git a/Makefile b/Makefile index c72dd2f414d..0d48fe8c521 100644 --- a/Makefile +++ b/Makefile @@ -105,7 +105,7 @@ unittest: runcimage -t --privileged --rm \ -v /lib/modules:/lib/modules:ro \ -v $(CURDIR):/go/src/$(PROJECT) \ - $(RUNC_IMAGE) make localunittest TESTFLAGS=$(TESTFLAGS) + $(RUNC_IMAGE) make localunittest TESTFLAGS="$(TESTFLAGS)" localunittest: all $(GO) test -timeout 3m -tags "$(BUILDTAGS)" $(TESTFLAGS) -v ./... @@ -115,7 +115,7 @@ integration: runcimage -t --privileged --rm \ -v /lib/modules:/lib/modules:ro \ -v $(CURDIR):/go/src/$(PROJECT) \ - $(RUNC_IMAGE) make localintegration TESTPATH=$(TESTPATH) + $(RUNC_IMAGE) make localintegration TESTPATH="$(TESTPATH)" localintegration: all bats -t tests/integration$(TESTPATH) diff --git a/contrib/cmd/recvtty/recvtty.go b/contrib/cmd/recvtty/recvtty.go index 35c293de642..532a4c33904 100644 --- a/contrib/cmd/recvtty/recvtty.go +++ b/contrib/cmd/recvtty/recvtty.go @@ -98,7 +98,7 @@ func handleSingle(path string, noStdin bool) error { defer socket.Close() // Get the master file descriptor from runC. - master, err := utils.RecvFd(socket) + master, err := utils.RecvFile(socket) if err != nil { return err } @@ -171,7 +171,7 @@ func handleNull(path string) error { defer socket.Close() // Get the master file descriptor from runC. - master, err := utils.RecvFd(socket) + master, err := utils.RecvFile(socket) if err != nil { return } diff --git a/libcontainer/configs/mount_linux.go b/libcontainer/configs/mount_linux.go index a4275df3a03..0cec2e5aed4 100644 --- a/libcontainer/configs/mount_linux.go +++ b/libcontainer/configs/mount_linux.go @@ -6,6 +6,14 @@ type Mount struct { // Source path for the mount. Source string `json:"source"` + // SourceFd is a open_tree(2)-style file descriptor created with the new + // mount API. If non-nil, this indicates that SourceFd is a configured + // mountfd that should be used for the mount into the container. + // + // Note that you cannot use /proc/self/fd/$n-style paths with + // open_tree(2)-style file descriptors. + SourceFd *int + // Destination path for the mount inside the container. Destination string `json:"destination"` @@ -34,13 +42,13 @@ type Mount struct { // Note that, the underlying filesystem should support this feature to be // used. // Every mount point could have its own mapping. - UIDMappings []IDMap `json:"uidMappings,omitempty"` + UIDMappings []IDMap `json:"uid_mappings,omitempty"` // GIDMappings is used to changing file group owners w/o calling chown. // Note that, the underlying filesystem should support this feature to be // used. // Every mount point could have its own mapping. - GIDMappings []IDMap `json:"gidMappings,omitempty"` + GIDMappings []IDMap `json:"gid_mappings,omitempty"` } func (m *Mount) IsBind() bool { diff --git a/libcontainer/configs/validate/validator.go b/libcontainer/configs/validate/validator.go index 196b431dba1..d4ca4bd707e 100644 --- a/libcontainer/configs/validate/validator.go +++ b/libcontainer/configs/validate/validator.go @@ -259,24 +259,11 @@ func checkIDMapMounts(config *configs.Config, m *configs.Mount) error { } if !m.IsBind() { - return fmt.Errorf("gidMappings/uidMappings is supported only for mounts with the option 'bind'") + return fmt.Errorf("id-mapped mounts are only supported for bind-mounts") } if config.RootlessEUID { - return fmt.Errorf("gidMappings/uidMappings is not supported when runc is being launched with EUID != 0, needs CAP_SYS_ADMIN on the runc parent's user namespace") + return fmt.Errorf("id-mapped mounts are not supported for rootless containers") } - if len(config.UIDMappings) == 0 || len(config.GIDMappings) == 0 { - return fmt.Errorf("not yet supported to use gidMappings/uidMappings in a mount without also using a user namespace") - } - if !sameMapping(config.UIDMappings, m.UIDMappings) { - return fmt.Errorf("not yet supported for the mount uidMappings to be different than user namespace uidMapping") - } - if !sameMapping(config.GIDMappings, m.GIDMappings) { - return fmt.Errorf("not yet supported for the mount gidMappings to be different than user namespace gidMapping") - } - if !filepath.IsAbs(m.Source) { - return fmt.Errorf("mount source not absolute") - } - return nil } @@ -298,27 +285,6 @@ func mounts(config *configs.Config) error { return nil } -// sameMapping checks if the mappings are the same. If the mappings are the same -// but in different order, it returns false. -func sameMapping(a, b []configs.IDMap) bool { - if len(a) != len(b) { - return false - } - - for i := range a { - if a[i].ContainerID != b[i].ContainerID { - return false - } - if a[i].HostID != b[i].HostID { - return false - } - if a[i].Size != b[i].Size { - return false - } - } - return true -} - func isHostNetNS(path string) (bool, error) { const currentProcessNetns = "/proc/self/ns/net" diff --git a/libcontainer/configs/validate/validator_test.go b/libcontainer/configs/validate/validator_test.go index 58aae7a68f8..18f6814a380 100644 --- a/libcontainer/configs/validate/validator_test.go +++ b/libcontainer/configs/validate/validator_test.go @@ -436,22 +436,7 @@ func TestValidateIDMapMounts(t *testing.T) { }, }, { - name: "idmap mount without userns mappings", - isErr: true, - config: &configs.Config{ - Mounts: []*configs.Mount{ - { - Source: "/abs/path/", - Destination: "/abs/path/", - Flags: unix.MS_BIND, - UIDMappings: mapping, - GIDMappings: mapping, - }, - }, - }, - }, - { - name: "idmap mounts with different userns and mount mappings", + name: "idmap mounts without abs dest path", isErr: true, config: &configs.Config{ UIDMappings: mapping, @@ -459,54 +444,40 @@ func TestValidateIDMapMounts(t *testing.T) { Mounts: []*configs.Mount{ { Source: "/abs/path/", - Destination: "/abs/path/", + Destination: "./rel/path/", Flags: unix.MS_BIND, - UIDMappings: []configs.IDMap{ - { - ContainerID: 10, - HostID: 10, - Size: 1, - }, - }, + UIDMappings: mapping, GIDMappings: mapping, }, }, }, }, { - name: "idmap mounts with different userns and mount mappings", - isErr: true, + name: "simple idmap mount", config: &configs.Config{ UIDMappings: mapping, GIDMappings: mapping, Mounts: []*configs.Mount{ { - Source: "/abs/path/", + Source: "/another-abs/path/", Destination: "/abs/path/", Flags: unix.MS_BIND, UIDMappings: mapping, - GIDMappings: []configs.IDMap{ - { - ContainerID: 10, - HostID: 10, - Size: 1, - }, - }, + GIDMappings: mapping, }, }, }, }, { - name: "idmap mounts without abs source path", - isErr: true, + name: "idmap mount with more flags", config: &configs.Config{ UIDMappings: mapping, GIDMappings: mapping, Mounts: []*configs.Mount{ { - Source: "./rel/path/", + Source: "/another-abs/path/", Destination: "/abs/path/", - Flags: unix.MS_BIND, + Flags: unix.MS_BIND | unix.MS_RDONLY, UIDMappings: mapping, GIDMappings: mapping, }, @@ -514,15 +485,12 @@ func TestValidateIDMapMounts(t *testing.T) { }, }, { - name: "idmap mounts without abs dest path", - isErr: true, + name: "idmap mount without userns mappings", config: &configs.Config{ - UIDMappings: mapping, - GIDMappings: mapping, Mounts: []*configs.Mount{ { Source: "/abs/path/", - Destination: "./rel/path/", + Destination: "/abs/path/", Flags: unix.MS_BIND, UIDMappings: mapping, GIDMappings: mapping, @@ -530,35 +498,46 @@ func TestValidateIDMapMounts(t *testing.T) { }, }, }, - { - name: "simple idmap mount", + name: "idmap mounts with different userns and mount mappings", config: &configs.Config{ UIDMappings: mapping, GIDMappings: mapping, Mounts: []*configs.Mount{ { - Source: "/another-abs/path/", + Source: "/abs/path/", Destination: "/abs/path/", Flags: unix.MS_BIND, - UIDMappings: mapping, + UIDMappings: []configs.IDMap{ + { + ContainerID: 10, + HostID: 10, + Size: 1, + }, + }, GIDMappings: mapping, }, }, }, }, { - name: "idmap mount with more flags", + name: "idmap mounts with different userns and mount mappings", config: &configs.Config{ UIDMappings: mapping, GIDMappings: mapping, Mounts: []*configs.Mount{ { - Source: "/another-abs/path/", + Source: "/abs/path/", Destination: "/abs/path/", - Flags: unix.MS_BIND | unix.MS_RDONLY, + Flags: unix.MS_BIND, UIDMappings: mapping, - GIDMappings: mapping, + GIDMappings: []configs.IDMap{ + { + ContainerID: 10, + HostID: 10, + Size: 1, + }, + }, }, }, }, diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go index 889dadd34d1..11ea28afcad 100644 --- a/libcontainer/container_linux.go +++ b/libcontainer/container_linux.go @@ -2,7 +2,6 @@ package libcontainer import ( "bytes" - "encoding/json" "errors" "fmt" "io" @@ -45,6 +44,7 @@ type Container struct { state containerState created time.Time fifo *os.File + safeExeFile *os.File } // State represents a running container's state @@ -316,6 +316,12 @@ func (c *Container) start(process *Process) (retErr error) { if err != nil { return fmt.Errorf("unable to create new parent process: %w", err) } + // This is no longer needed after the process has been spawned. We also + // want to make sure that (especially in the case of O_TMPFILE descriptors) + // that we use a new copy for each execution, because an attacker + // overwriting our copy would be just as bad as overwiting the host runc + // binary if we re-use the copy. + defer c.clearSafeExe() logsDone := parent.forwardChildLogs() if logsDone != nil { @@ -441,6 +447,125 @@ func (c *Container) includeExecFifo(cmd *exec.Cmd) error { return nil } +func (c *Container) clearSafeExe() { + if c.safeExeFile != nil { + _ = c.safeExeFile.Close() + c.safeExeFile = nil + } +} + +// makeSafeExe makes a copy of /proc/self/exe that is safe to use when +// executing inside a container. On modern kernels, this is a locked executable +// memfd that contains a copy of /proc/self/exe. The returned string is a +// /proc/self/fd/... path that can be used directly with "os/exec".Command(). +// For more details on why this is necessary, see CVE-2019-5736. +func (c *Container) makeSafeExe() (path string, Err error) { + if c.safeExeFile == nil { + var err error + var sealFn func(**os.File) error + + // Close safeExeFile if we fail to make it properly. + defer func() { + if c.safeExeFile != nil && Err != nil { + c.clearSafeExe() + } + }() + + // First, try an executable memfd (supported since Linux 3.17). + c.safeExeFile, err = system.ExecutableMemfd("runc_cloned:/proc/self/exe", unix.MFD_ALLOW_SEALING|unix.MFD_CLOEXEC) + if err != nil { + logrus.Debugf("memfd cloned binary failed, falling back to O_TMPFILE: %v", err) + } else { + sealFn = func(f **os.File) error { + if err := (*f).Chmod(0o511); err != nil { + return fmt.Errorf("chmod memfd: %w", err) + } + // Try to set the newer memfd sealing flags, but we ignore + // errors because they are not needed and we want to continue + // to work on older kernels. + fd := (*f).Fd() + // F_SEAL_FUTURE_WRITE -- Linux 5.1 + _, _ = unix.FcntlInt(fd, unix.F_ADD_SEALS, unix.F_SEAL_FUTURE_WRITE) + // F_SEAL_EXEC -- Linux 6.3 + const F_SEAL_EXEC = 0x2000 //nolint:revive // this matches the unix.* name + _, _ = unix.FcntlInt(fd, unix.F_ADD_SEALS, F_SEAL_EXEC) + // Apply all original memfd seals. + _, err := unix.FcntlInt(fd, unix.F_ADD_SEALS, unix.F_SEAL_SEAL|unix.F_SEAL_SHRINK|unix.F_SEAL_GROW|unix.F_SEAL_WRITE) + return os.NewSyscallError("fcntl(F_ADD_SEALS)", err) + } + } + + // Try to fallback to O_TMPFILE (supported since Linux 3.11). + if c.safeExeFile == nil { + var stat unix.Stat_t + c.safeExeFile, err = os.OpenFile(c.root, unix.O_TMPFILE|unix.O_RDWR|unix.O_EXCL|unix.O_CLOEXEC, 0o700) + if err != nil { + logrus.Debugf("O_TMPFILE cloned binary failed, falling back to mktemp(): %v", err) + } else if err := unix.Fstat(int(c.safeExeFile.Fd()), &stat); err != nil || stat.Nlink != 0 { + logrus.Debugf("O_TMPFILE cloned binary has non-zero nlink, falling back to mktemp(): %v", err) + c.clearSafeExe() + } + + // Finally, fallback to a classic temporary file we unlink. + if c.safeExeFile == nil { + c.safeExeFile, err = os.CreateTemp(c.root, "runc.") + if err != nil { + return "", fmt.Errorf("could not clone binary: %w", err) + } + // Unlink the file and verify it was unlinked. + if err := os.Remove(c.safeExeFile.Name()); err != nil { + return "", fmt.Errorf("unlinking classic tmpfile: %w", err) + } + if err := unix.Fstat(int(c.safeExeFile.Fd()), &stat); err != nil { + return "", fmt.Errorf("classic tmpfile fstat: %w", err) + } else if stat.Nlink != 0 { + return "", fmt.Errorf("classic tmpfile %s has non-zero nlink after unlink", c.safeExeFile.Name()) + } + } + sealFn = func(f **os.File) error { + if err := (*f).Chmod(0o511); err != nil { + return fmt.Errorf("chmod tmpfile: %w", err) + } + // When sealing an O_TMPFILE-style descriptor we need to + // re-open the path as O_PATH to clear the existing write + // handle we have. + opath, err := os.OpenFile(fmt.Sprintf("/proc/self/fd/%d", (*f).Fd()), unix.O_PATH|unix.O_CLOEXEC, 0) + if err != nil { + return fmt.Errorf("reopen tmpfile: %w", err) + } + _ = (*f).Close() + *f = opath + return nil + } + } + + // Copy the contents of /proc/self/exe to the cloned fd. + srcExeFile, err := os.Open("/proc/self/exe") + if err != nil { + return "", fmt.Errorf("cannot open current process exe: %w", err) + } + defer srcExeFile.Close() + stat, err := srcExeFile.Stat() + if err != nil { + return "", fmt.Errorf("checking /proc/self/exe size: %w", err) + } + exeSize := stat.Size() + + copied, err := system.Copy(c.safeExeFile, srcExeFile) + if err != nil { + return "", fmt.Errorf("copy binary: %w", err) + } else if copied != exeSize { + return "", fmt.Errorf("copied binary size mismatch: %d != %d", copied, exeSize) + } + + // Seal the descriptor. + if err := sealFn(&c.safeExeFile); err != nil { + return "", fmt.Errorf("could not seal fd: %w", err) + } + } + return fmt.Sprintf("/proc/self/fd/%d", c.safeExeFile.Fd()), nil +} + func (c *Container) newParentProcess(p *Process) (parentProcess, error) { parentInitPipe, childInitPipe, err := utils.NewSockPair("init") if err != nil { @@ -454,24 +579,12 @@ func (c *Container) newParentProcess(p *Process) (parentProcess, error) { } logFilePair := filePair{parentLogPipe, childLogPipe} - cmd := c.commandTemplate(p, childInitPipe, childLogPipe) - if !p.Init { - return c.newSetnsProcess(p, cmd, messageSockPair, logFilePair) - } - - // We only set up fifoFd if we're not doing a `runc exec`. The historic - // reason for this is that previously we would pass a dirfd that allowed - // for container rootfs escape (and not doing it in `runc exec` avoided - // that problem), but we no longer do that. However, there's no need to do - // this for `runc exec` so we just keep it this way to be safe. - if err := c.includeExecFifo(cmd); err != nil { - return nil, fmt.Errorf("unable to setup exec fifo: %w", err) + exePath, err := c.makeSafeExe() + if err != nil { + return nil, fmt.Errorf("unable to create safe /proc/self/exe clone: %w", err) } - return c.newInitProcess(p, cmd, messageSockPair, logFilePair) -} -func (c *Container) commandTemplate(p *Process, childInitPipe *os.File, childLogPipe *os.File) *exec.Cmd { - cmd := exec.Command("/proc/self/exe", "init") + cmd := exec.Command(exePath, "init") cmd.Args[0] = os.Args[0] cmd.Stdin = p.Stdin cmd.Stdout = p.Stdout @@ -500,118 +613,36 @@ func (c *Container) commandTemplate(p *Process, childInitPipe *os.File, childLog "_LIBCONTAINER_LOGLEVEL="+p.LogLevel, ) - // NOTE: when running a container with no PID namespace and the parent process spawning the container is - // PID1 the pdeathsig is being delivered to the container's init process by the kernel for some reason - // even with the parent still running. - if c.config.ParentDeathSignal > 0 { - cmd.SysProcAttr.Pdeathsig = unix.Signal(c.config.ParentDeathSignal) - } - return cmd -} - -// shouldSendMountSources says whether the child process must setup bind mounts with -// the source pre-opened (O_PATH) in the host user namespace. -// See https://github.com/opencontainers/runc/issues/2484 -func (c *Container) shouldSendMountSources() bool { - // Passing the mount sources via SCM_RIGHTS is only necessary when - // both userns and mntns are active. - if !c.config.Namespaces.Contains(configs.NEWUSER) || - !c.config.Namespaces.Contains(configs.NEWNS) { - return false - } - - // nsexec.c send_mountsources() requires setns(mntns) capabilities - // CAP_SYS_CHROOT and CAP_SYS_ADMIN. - if c.config.RootlessEUID { - return false - } - - // We need to send sources if there are non-idmap bind-mounts. - for _, m := range c.config.Mounts { - if m.IsBind() && !m.IsIDMapped() { - return true - } - } - - return false -} - -// shouldSendIdmapSources says whether the child process must setup idmap mounts with -// the mount_setattr already done in the host user namespace. -func (c *Container) shouldSendIdmapSources() bool { - // nsexec.c mount_setattr() requires CAP_SYS_ADMIN in: - // * the user namespace the filesystem was mounted in; - // * the user namespace we're trying to idmap the mount to; - // * the owning user namespace of the mount namespace you're currently located in. + // Due to a Go stdlib bug, we need to add c.safeExeFile to the set of + // ExtraFiles otherwise it is possible for the stdlib to clobber the fd + // during forkAndExecInChild1 and replace it with some other file that + // might be malicious. This is less than ideal (because the descriptor will + // be non-O_CLOEXEC) however we have protections in "runc init" to stop us + // from leaking extra file descriptors. // - // See the comment from Christian Brauner: - // https://github.com/opencontainers/runc/pull/3717#discussion_r1103607972 - // - // Let's just rule out rootless, we don't have those permission in the - // rootless case. - if c.config.RootlessEUID { - return false - } + // See . + cmd.ExtraFiles = append(cmd.ExtraFiles, c.safeExeFile) - // For the time being we require userns to be in use. - if !c.config.Namespaces.Contains(configs.NEWUSER) { - return false + // NOTE: when running a container with no PID namespace and the parent + // process spawning the container is PID1 the pdeathsig is being + // delivered to the container's init process by the kernel for some + // reason even with the parent still running. + if c.config.ParentDeathSignal > 0 { + cmd.SysProcAttr.Pdeathsig = unix.Signal(c.config.ParentDeathSignal) } - // We need to send sources if there are idmap bind-mounts. - for _, m := range c.config.Mounts { - if m.IsBind() && m.IsIDMapped() { - return true + if p.Init { + // We only set up fifoFd if we're not doing a `runc exec`. The historic + // reason for this is that previously we would pass a dirfd that allowed + // for container rootfs escape (and not doing it in `runc exec` avoided + // that problem), but we no longer do that. However, there's no need to do + // this for `runc exec` so we just keep it this way to be safe. + if err := c.includeExecFifo(cmd); err != nil { + return nil, fmt.Errorf("unable to setup exec fifo: %w", err) } + return c.newInitProcess(p, cmd, messageSockPair, logFilePair) } - - return false -} - -func (c *Container) sendMountSources(cmd *exec.Cmd, messageSockPair filePair) error { - if !c.shouldSendMountSources() { - return nil - } - - return c.sendFdsSources(cmd, messageSockPair, "_LIBCONTAINER_MOUNT_FDS", func(m *configs.Mount) bool { - return m.IsBind() && !m.IsIDMapped() - }) -} - -func (c *Container) sendIdmapSources(cmd *exec.Cmd, messageSockPair filePair) error { - if !c.shouldSendIdmapSources() { - return nil - } - - return c.sendFdsSources(cmd, messageSockPair, "_LIBCONTAINER_IDMAP_FDS", func(m *configs.Mount) bool { - return m.IsBind() && m.IsIDMapped() - }) -} - -func (c *Container) sendFdsSources(cmd *exec.Cmd, messageSockPair filePair, envVar string, condition func(*configs.Mount) bool) error { - // Elements on these slices will be paired with mounts (see StartInitialization() and - // prepareRootfs()). These slices MUST have the same size as c.config.Mounts. - fds := make([]int, len(c.config.Mounts)) - for i, m := range c.config.Mounts { - if !condition(m) { - // The -1 fd is ignored later. - fds[i] = -1 - continue - } - - // The fd passed here will not be used: nsexec.c will overwrite it with dup3(). We just need - // to allocate a fd so that we know the number to pass in the environment variable. The fd - // must not be closed before cmd.Start(), so we reuse messageSockPair.child because the - // lifecycle of that fd is already taken care of. - cmd.ExtraFiles = append(cmd.ExtraFiles, messageSockPair.child) - fds[i] = stdioFdCount + len(cmd.ExtraFiles) - 1 - } - fdsJSON, err := json.Marshal(fds) - if err != nil { - return fmt.Errorf("Error creating %v: %w", envVar, err) - } - cmd.Env = append(cmd.Env, envVar+"="+string(fdsJSON)) - return nil + return c.newSetnsProcess(p, cmd, messageSockPair, logFilePair) } func (c *Container) newInitProcess(p *Process, cmd *exec.Cmd, messageSockPair, logFilePair filePair) (*initProcess, error) { @@ -622,16 +653,10 @@ func (c *Container) newInitProcess(p *Process, cmd *exec.Cmd, messageSockPair, l nsMaps[ns.Type] = ns.Path } } - data, err := c.bootstrapData(c.config.Namespaces.CloneFlags(), nsMaps, initStandard) + data, err := c.bootstrapData(c.config.Namespaces.CloneFlags(), nsMaps) if err != nil { return nil, err } - if err := c.sendMountSources(cmd, messageSockPair); err != nil { - return nil, err - } - if err := c.sendIdmapSources(cmd, messageSockPair); err != nil { - return nil, err - } init := &initProcess{ cmd: cmd, @@ -656,7 +681,7 @@ func (c *Container) newSetnsProcess(p *Process, cmd *exec.Cmd, messageSockPair, } // for setns process, we don't have to set cloneflags as the process namespaces // will only be set via setns syscall - data, err := c.bootstrapData(0, state.NamespacePaths, initSetns) + data, err := c.bootstrapData(0, state.NamespacePaths) if err != nil { return nil, err } @@ -1046,7 +1071,7 @@ type netlinkError struct{ error } // such as one that uses nsenter package to bootstrap the container's // init process correctly, i.e. with correct namespaces, uid/gid // mapping etc. -func (c *Container) bootstrapData(cloneFlags uintptr, nsMaps map[configs.NamespaceType]string, it initType) (_ io.Reader, Err error) { +func (c *Container) bootstrapData(cloneFlags uintptr, nsMaps map[configs.NamespaceType]string) (_ io.Reader, Err error) { // create the netlink message r := nl.NewNetlinkRequest(int(InitMsg), 0) @@ -1148,48 +1173,6 @@ func (c *Container) bootstrapData(cloneFlags uintptr, nsMaps map[configs.Namespa Value: c.config.RootlessEUID, }) - // Bind mount source to open. - if it == initStandard && c.shouldSendMountSources() { - var mounts []byte - for _, m := range c.config.Mounts { - if m.IsBind() && !m.IsIDMapped() { - if strings.IndexByte(m.Source, 0) >= 0 { - return nil, fmt.Errorf("mount source string contains null byte: %q", m.Source) - } - mounts = append(mounts, []byte(m.Source)...) - } - mounts = append(mounts, byte(0)) - } - - r.AddData(&Bytemsg{ - Type: MountSourcesAttr, - Value: mounts, - }) - } - - // Idmap mount sources to open. - if it == initStandard && c.shouldSendIdmapSources() { - var mounts []byte - for _, m := range c.config.Mounts { - if m.IsBind() && m.IsIDMapped() { - // While other parts of the code check this too (like - // libcontainer/specconv/spec_linux.go) we do it here also because some libcontainer - // users don't use those functions. - if strings.IndexByte(m.Source, 0) >= 0 { - return nil, fmt.Errorf("mount source string contains null byte: %q", m.Source) - } - - mounts = append(mounts, []byte(m.Source)...) - } - mounts = append(mounts, byte(0)) - } - - r.AddData(&Bytemsg{ - Type: IdmapSourcesAttr, - Value: mounts, - }) - } - return bytes.NewReader(r.Serialize()), nil } diff --git a/libcontainer/criu_linux.go b/libcontainer/criu_linux.go index 31c8a900eb1..7b209aa07af 100644 --- a/libcontainer/criu_linux.go +++ b/libcontainer/criu_linux.go @@ -618,8 +618,8 @@ func (c *Container) prepareCriuRestoreMounts(mounts []*configs.Mount) error { // because during initial container creation mounts are // set up in the order they are configured. if m.Device == "bind" { - if err := utils.WithProcfd(c.config.Rootfs, m.Destination, func(dstFD string) error { - return mountViaFDs(m.Source, nil, m.Destination, dstFD, "", unix.MS_BIND|unix.MS_REC, "") + if err := utils.WithProcfd(c.config.Rootfs, m.Destination, func(dstFd string) error { + return mountViaFds(m.Source, nil, m.Destination, dstFd, "", unix.MS_BIND|unix.MS_REC, "") }); err != nil { return err } @@ -1185,7 +1185,7 @@ func (c *Container) criuNotifications(resp *criurpc.CriuResp, process *Process, defer master.Close() // While we can access console.master, using the API is a good idea. - if err := utils.SendFd(process.ConsoleSocket, master.Name(), master.Fd()); err != nil { + if err := utils.SendFile(process.ConsoleSocket, master); err != nil { return err } case "status-ready": diff --git a/libcontainer/factory_linux.go b/libcontainer/factory_linux.go index 4cb4a88bf0d..54a7bec91a1 100644 --- a/libcontainer/factory_linux.go +++ b/libcontainer/factory_linux.go @@ -214,18 +214,3 @@ func validateID(id string) error { return nil } - -func parseFdsFromEnv(envVar string) ([]int, error) { - fdsJSON := os.Getenv(envVar) - if fdsJSON == "" { - // Always return the nil slice if no fd is present. - return nil, nil - } - - var fds []int - if err := json.Unmarshal([]byte(fdsJSON), &fds); err != nil { - return nil, fmt.Errorf("Error unmarshalling %v: %w", envVar, err) - } - - return fds, nil -} diff --git a/libcontainer/init_linux.go b/libcontainer/init_linux.go index 42cae1ccb65..d72ca8a3dc9 100644 --- a/libcontainer/init_linux.go +++ b/libcontainer/init_linux.go @@ -5,9 +5,9 @@ import ( "encoding/json" "errors" "fmt" - "io" "net" "os" + "runtime" "runtime/debug" "strconv" "strings" @@ -47,18 +47,6 @@ type network struct { TempVethPeerName string `json:"temp_veth_peer_name"` } -type mountFds struct { - // sourceFds are the fds to use as source when mounting. - // The slice size should be the same as container mounts, as it will be - // paired with them. - // The value -1 is used when no fd is needed for the mount. - // Can't have a valid fd in the same position that other slices in this struct. - // We need to use only one of these fds on any single mount. - sourceFds []int - // Idem sourceFds, but fds of already created idmap mounts, to use with unix.MoveMount(). - idmapFds []int -} - // initConfig is used for transferring parameters from Exec() to Init() type initConfig struct { Args []string `json:"args"` @@ -102,11 +90,8 @@ func StartInitialization() (retErr error) { defer func() { // We have an error during the initialization of the container's init, // send it back to the parent process in the form of an initError. - if err := writeSync(pipe, procError); err != nil { - fmt.Fprintln(os.Stderr, retErr) - return - } - if err := utils.WriteJSON(pipe, &initError{Message: retErr.Error()}); err != nil { + ierr := initError{Message: retErr.Error()} + if err := writeSyncArg(pipe, procError, ierr); err != nil { fmt.Fprintln(os.Stderr, retErr) return } @@ -139,18 +124,6 @@ func StartInitialization() (retErr error) { return fmt.Errorf("unable to convert _LIBCONTAINER_LOGPIPE: %w", err) } - // Get mount files (O_PATH). - mountSrcFds, err := parseFdsFromEnv("_LIBCONTAINER_MOUNT_FDS") - if err != nil { - return err - } - - // Get idmap fds. - idmapFds, err := parseFdsFromEnv("_LIBCONTAINER_IDMAP_FDS") - if err != nil { - return err - } - // clear the current process's environment to clean any libcontainer // specific env vars. os.Clearenv() @@ -166,10 +139,10 @@ func StartInitialization() (retErr error) { }() // If init succeeds, it will not return, hence none of the defers will be called. - return containerInit(it, pipe, consoleSocket, fifofd, logPipeFd, mountFds{sourceFds: mountSrcFds, idmapFds: idmapFds}) + return containerInit(it, pipe, consoleSocket, fifofd, logPipeFd) } -func containerInit(t initType, pipe *os.File, consoleSocket *os.File, fifoFd, logFd int, mountFds mountFds) error { +func containerInit(t initType, pipe *os.File, consoleSocket *os.File, fifoFd, logFd int) error { var config *initConfig if err := json.NewDecoder(pipe).Decode(&config); err != nil { return err @@ -179,11 +152,6 @@ func containerInit(t initType, pipe *os.File, consoleSocket *os.File, fifoFd, lo } switch t { case initSetns: - // mount and idmap fds must be nil in this case. We don't mount while doing runc exec. - if mountFds.sourceFds != nil || mountFds.idmapFds != nil { - return errors.New("mount and idmap fds must be nil; can't mount from exec") - } - i := &linuxSetnsInit{ pipe: pipe, consoleSocket: consoleSocket, @@ -199,7 +167,6 @@ func containerInit(t initType, pipe *os.File, consoleSocket *os.File, fifoFd, lo config: config, fifoFd: fifoFd, logFd: logFd, - mountFds: mountFds, } return i.Init() } @@ -317,7 +284,6 @@ func setupConsole(socket *os.File, config *initConfig, mount bool) error { if err != nil { return err } - // After we return from here, we don't need the console anymore. defer pty.Close() @@ -339,9 +305,11 @@ func setupConsole(socket *os.File, config *initConfig, mount bool) error { } } // While we can access console.master, using the API is a good idea. - if err := utils.SendFd(socket, pty.Name(), pty.Fd()); err != nil { + if err := utils.SendRawFd(socket, pty.Name(), pty.Fd()); err != nil { return err } + runtime.KeepAlive(pty) + // Now, dup over all the things. return dupStdio(slavePath) } @@ -349,12 +317,11 @@ func setupConsole(socket *os.File, config *initConfig, mount bool) error { // syncParentReady sends to the given pipe a JSON payload which indicates that // the init is ready to Exec the child process. It then waits for the parent to // indicate that it is cleared to Exec. -func syncParentReady(pipe io.ReadWriter) error { +func syncParentReady(pipe *os.File) error { // Tell parent. if err := writeSync(pipe, procReady); err != nil { return err } - // Wait for parent to give the all-clear. return readSync(pipe, procRun) } @@ -362,44 +329,37 @@ func syncParentReady(pipe io.ReadWriter) error { // syncParentHooks sends to the given pipe a JSON payload which indicates that // the parent should execute pre-start hooks. It then waits for the parent to // indicate that it is cleared to resume. -func syncParentHooks(pipe io.ReadWriter) error { +func syncParentHooks(pipe *os.File) error { // Tell parent. if err := writeSync(pipe, procHooks); err != nil { return err } - // Wait for parent to give the all-clear. return readSync(pipe, procResume) } -// syncParentSeccomp sends to the given pipe a JSON payload which -// indicates that the parent should pick up the seccomp fd with pidfd_getfd() -// and send it to the seccomp agent over a unix socket. It then waits for -// the parent to indicate that it is cleared to resume and closes the seccompFd. -// If the seccompFd is -1, there isn't anything to sync with the parent, so it -// returns no error. -func syncParentSeccomp(pipe io.ReadWriter, seccompFd int) error { - if seccompFd == -1 { +// syncParentSeccomp sends the fd associated with the seccomp file descriptor +// to the parent, and wait for the parent to do pidfd_getfd() to grab a copy. +func syncParentSeccomp(pipe *os.File, seccompFd *os.File) error { + if seccompFd == nil { return nil } - - // Tell parent. - if err := writeSyncWithFd(pipe, procSeccomp, seccompFd); err != nil { - unix.Close(seccompFd) + defer seccompFd.Close() + + // Tell parent to grab our fd. + // + // Notably, we do not use writeSyncFile here because a container might have + // an SCMP_ACT_NOTIFY action on sendmsg(2) so we need to use the smallest + // possible number of system calls here because all of those syscalls + // cannot be used with SCMP_ACT_NOTIFY as a result (any syscall we use here + // before the parent gets the file descriptor would deadlock "runc init" if + // we allowed it for SCMP_ACT_NOTIFY). See seccomp.InitSeccomp() for more + // details. + if err := writeSyncArg(pipe, procSeccomp, seccompFd.Fd()); err != nil { return err } - - // Wait for parent to give the all-clear. - if err := readSync(pipe, procSeccompDone); err != nil { - unix.Close(seccompFd) - return fmt.Errorf("sync parent seccomp: %w", err) - } - - if err := unix.Close(seccompFd); err != nil { - return fmt.Errorf("close seccomp fd: %w", err) - } - - return nil + // Wait for parent to tell us they've grabbed the seccompfd. + return readSync(pipe, procSeccompDone) } // setupUser changes the groups, gid, and uid for the user inside the container diff --git a/libcontainer/integration/execin_test.go b/libcontainer/integration/execin_test.go index 58519a419c0..c5c324130c6 100644 --- a/libcontainer/integration/execin_test.go +++ b/libcontainer/integration/execin_test.go @@ -277,9 +277,9 @@ func TestExecInTTY(t *testing.T) { done := make(chan (error)) go func() { - f, err := utils.RecvFd(parent) + f, err := utils.RecvFile(parent) if err != nil { - done <- fmt.Errorf("RecvFd: %w", err) + done <- fmt.Errorf("RecvFile: %w", err) return } c, err := console.ConsoleFromFile(f) diff --git a/libcontainer/message_linux.go b/libcontainer/message_linux.go index 17db81a29f3..d78e0328a20 100644 --- a/libcontainer/message_linux.go +++ b/libcontainer/message_linux.go @@ -21,8 +21,6 @@ const ( RootlessEUIDAttr uint16 = 27287 UidmapPathAttr uint16 = 27288 GidmapPathAttr uint16 = 27289 - MountSourcesAttr uint16 = 27290 - IdmapSourcesAttr uint16 = 27291 ) type Int32msg struct { diff --git a/libcontainer/mount_linux.go b/libcontainer/mount_linux.go index da11da68ea0..ce71f63ad68 100644 --- a/libcontainer/mount_linux.go +++ b/libcontainer/mount_linux.go @@ -1,22 +1,47 @@ package libcontainer import ( + "errors" + "fmt" "io/fs" + "os" "strconv" + "github.com/sirupsen/logrus" "golang.org/x/sys/unix" + + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/userns" +) + +// mountSourceType indicates what type of file descriptor is being returned. It +// is used to tell rootfs_linux.go whether or not to use move_mount(2) to +// install the mount. +type mountSourceType string + +const ( + // An open_tree(2)-style file descriptor that needs to be installed using + // move_mount(2) to install. + mountSourceOpenTree mountSourceType = "open_tree" + // A plain file descriptor that can be mounted through /proc/self/fd. + mountSourcePlain mountSourceType = "plain-open" ) +type mountSource struct { + Type mountSourceType `json:"source_type"` + file *os.File `json:"-"` +} + // mountError holds an error from a failed mount or unmount operation. type mountError struct { - op string - source string - srcFD *int - target string - dstFD string - flags uintptr - data string - err error + op string + source string + srcFile *mountSource + target string + dstFd string + flags uintptr + data string + err error } // Error provides a string error representation. @@ -25,13 +50,14 @@ func (e *mountError) Error() string { if e.source != "" { out += "src=" + e.source + ", " - if e.srcFD != nil { - out += "srcFD=" + strconv.Itoa(*e.srcFD) + ", " + if e.srcFile != nil { + out += "srcType=" + string(e.srcFile.Type) + ", " + out += "srcFd=" + strconv.Itoa(int(e.srcFile.file.Fd())) + ", " } } out += "dst=" + e.target - if e.dstFD != "" { - out += ", dstFD=" + e.dstFD + if e.dstFd != "" { + out += ", dstFd=" + e.dstFd } if e.flags != uintptr(0) { @@ -54,38 +80,58 @@ func (e *mountError) Unwrap() error { // mount is a simple unix.Mount wrapper, returning an error with more context // in case it failed. func mount(source, target, fstype string, flags uintptr, data string) error { - return mountViaFDs(source, nil, target, "", fstype, flags, data) + return mountViaFds(source, nil, target, "", fstype, flags, data) } -// mountViaFDs is a unix.Mount wrapper which uses srcFD instead of source, -// and dstFD instead of target, unless those are empty. -// If srcFD is different than nil, its path (i.e. "/proc/self/fd/NN") will be -// constructed by this function. -// dstFD argument, if non-empty, is expected to be in the form of a path to an -// opened file descriptor on procfs (i.e. "/proc/self/fd/NN"). +// mountViaFds is a unix.Mount wrapper which uses srcFile instead of source, +// and dstFd instead of target, unless those are empty. // -// If case an FD is used instead of a source or a target path, the -// corresponding path is only used to add context to an error in case -// the mount operation has failed. -func mountViaFDs(source string, srcFD *int, target, dstFD, fstype string, flags uintptr, data string) error { - src := source - if srcFD != nil { - src = "/proc/self/fd/" + strconv.Itoa(*srcFD) +// If srcFile is non-nil and flags does not contain MS_REMOUNT, mountViaFds +// will mount it according to the mountSourceType of the file descriptor. +// +// The dstFd argument, if non-empty, is expected to be in the form of a path to +// an opened file descriptor on procfs (i.e. "/proc/self/fd/NN"). +// +// If a file descriptor is used instead of a source or a target path, the +// corresponding path is only used to add context to an error in case the mount +// operation has failed. +func mountViaFds(source string, srcFile *mountSource, target, dstFd, fstype string, flags uintptr, data string) error { + // MS_REMOUNT and srcFile don't make sense together. + if srcFile != nil && flags&unix.MS_REMOUNT != 0 { + logrus.Debugf("mount source passed along with MS_REMOUNT -- ignoring srcFile") + srcFile = nil } + dst := target - if dstFD != "" { - dst = dstFD + if dstFd != "" { + dst = dstFd + } + src := source + if srcFile != nil { + src = "/proc/self/fd/" + strconv.Itoa(int(srcFile.file.Fd())) } - if err := unix.Mount(src, dst, fstype, flags, data); err != nil { + + var op string + var err error + if srcFile != nil && srcFile.Type == mountSourceOpenTree { + op = "move_mount" + err = unix.MoveMount(int(srcFile.file.Fd()), "", + unix.AT_FDCWD, dstFd, + unix.MOVE_MOUNT_F_EMPTY_PATH|unix.MOVE_MOUNT_T_SYMLINKS) + } else { + op = "mount" + err = unix.Mount(src, dst, fstype, flags, data) + } + if err != nil { return &mountError{ - op: "mount", - source: source, - srcFD: srcFD, - target: target, - dstFD: dstFD, - flags: flags, - data: data, - err: err, + op: op, + source: source, + srcFile: srcFile, + target: target, + dstFd: dstFd, + flags: flags, + data: data, + err: err, } } return nil @@ -121,3 +167,75 @@ func syscallMode(i fs.FileMode) (o uint32) { // No mapping for Go's ModeTemporary (plan9 only). return } + +// mountFd creates an open_tree(2)-like mount fd from the provided +// configuration. This function must be called from within the container's +// mount namespace. +func mountFd(nsHandles *userns.Handles, m *configs.Mount) (mountSource, error) { + if !m.IsBind() { + return mountSource{}, errors.New("new mount api: only bind-mounts are supported") + } + if nsHandles == nil { + nsHandles = new(userns.Handles) + defer nsHandles.Release() + } + + var mountFile *os.File + var sourceType mountSourceType + + if m.IsBind() { + // We only need to use OPEN_TREE_CLONE in the case where we need to use + // mount_setattr(2). We are currently in the container namespace and + // there is no risk of an opened directory being used to escape the + // container. OPEN_TREE_CLONE is more expensive than open(2) because it + // requires doing mounts inside a new anonymous mount namespace. + if m.IsIDMapped() { + flags := uint(unix.OPEN_TREE_CLONE | unix.O_CLOEXEC) + if m.Flags&unix.MS_REC == unix.MS_REC { + flags |= unix.AT_RECURSIVE + } + mountFd, err := unix.OpenTree(unix.AT_FDCWD, m.Source, flags) + if err != nil { + return mountSource{}, &os.PathError{Op: "open_tree(OPEN_TREE_CLONE)", Path: m.Source, Err: err} + } + mountFile = os.NewFile(uintptr(mountFd), m.Source) + sourceType = mountSourceOpenTree + } else { + var err error + mountFile, err = os.OpenFile(m.Source, unix.O_PATH|unix.O_CLOEXEC, 0) + if err != nil { + return mountSource{}, err + } + sourceType = mountSourcePlain + } + } + + if m.IsIDMapped() { + if mountFile == nil { + return mountSource{}, fmt.Errorf("invalid mount source %q: id-mapping of non-bind-mounts is not supported", m.Source) + } + if sourceType != mountSourceOpenTree { + // should never happen + return mountSource{}, fmt.Errorf("invalid mount source %q: id-mapped target mistakenly opened without OPEN_TREE_CLONE", m.Source) + } + + usernsFile, err := nsHandles.Get(userns.Mapping{ + UIDMappings: m.UIDMappings, + GIDMappings: m.GIDMappings, + }) + if err != nil { + return mountSource{}, fmt.Errorf("failed to create userns for %s id-mapping: %w", m.Source, err) + } + defer usernsFile.Close() + if err := unix.MountSetattr(int(mountFile.Fd()), "", unix.AT_EMPTY_PATH, &unix.MountAttr{ + Attr_set: unix.MOUNT_ATTR_IDMAP, + Userns_fd: uint64(usernsFile.Fd()), + }); err != nil { + return mountSource{}, fmt.Errorf("failed to set IDMAP_SOURCE_ATTR on %s: %w", m.Source, err) + } + } + return mountSource{ + Type: sourceType, + file: mountFile, + }, nil +} diff --git a/libcontainer/nsenter/cloned_binary.c b/libcontainer/nsenter/cloned_binary.c deleted file mode 100644 index a7f992fddd7..00000000000 --- a/libcontainer/nsenter/cloned_binary.c +++ /dev/null @@ -1,567 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 OR LGPL-2.1-or-later -/* - * Copyright (C) 2019 Aleksa Sarai - * Copyright (C) 2019 SUSE LLC - * - * This work is dual licensed under the following licenses. You may use, - * redistribute, and/or modify the work under the conditions of either (or - * both) licenses. - * - * === Apache-2.0 === - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - * === LGPL-2.1-or-later === - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this library. If not, see - * . - * - */ - -#define _GNU_SOURCE -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "ipc.h" -#include "log.h" - -/* Use our own wrapper for memfd_create. */ -#ifndef SYS_memfd_create -# ifdef __NR_memfd_create -# define SYS_memfd_create __NR_memfd_create -# else -/* These values come from . */ -# warning "libc is outdated -- using hard-coded SYS_memfd_create" -# if defined(__x86_64__) -# define SYS_memfd_create 319 -# elif defined(__i386__) -# define SYS_memfd_create 356 -# elif defined(__ia64__) -# define SYS_memfd_create 1340 -# elif defined(__arm__) -# define SYS_memfd_create 385 -# elif defined(__aarch64__) -# define SYS_memfd_create 279 -# elif defined(__ppc__) || defined(__PPC64__) || defined(__powerpc64__) -# define SYS_memfd_create 360 -# elif defined(__s390__) || defined(__s390x__) -# define SYS_memfd_create 350 -# else -# warning "unknown architecture -- cannot hard-code SYS_memfd_create" -# endif -# endif -#endif - -/* memfd_create(2) flags -- copied from . */ -#ifndef MFD_CLOEXEC -# define MFD_CLOEXEC 0x0001U -# define MFD_ALLOW_SEALING 0x0002U -#endif -#ifndef MFD_EXEC -# define MFD_EXEC 0x0010U -#endif - -int memfd_create(const char *name, unsigned int flags) -{ -#ifdef SYS_memfd_create - return syscall(SYS_memfd_create, name, flags); -#else - errno = ENOSYS; - return -1; -#endif -} - -/* This comes directly from . */ -#ifndef F_LINUX_SPECIFIC_BASE -# define F_LINUX_SPECIFIC_BASE 1024 -#endif -#ifndef F_ADD_SEALS -# define F_ADD_SEALS (F_LINUX_SPECIFIC_BASE + 9) -# define F_GET_SEALS (F_LINUX_SPECIFIC_BASE + 10) -#endif -#ifndef F_SEAL_SEAL -# define F_SEAL_SEAL 0x0001 /* prevent further seals from being set */ -# define F_SEAL_SHRINK 0x0002 /* prevent file from shrinking */ -# define F_SEAL_GROW 0x0004 /* prevent file from growing */ -# define F_SEAL_WRITE 0x0008 /* prevent writes */ -#endif -#ifndef F_SEAL_FUTURE_WRITE -# define F_SEAL_FUTURE_WRITE 0x0010 /* prevent future writes while mapped */ -#endif -#ifndef F_SEAL_EXEC -# define F_SEAL_EXEC 0x0020 /* prevent chmod modifying exec bits */ -#endif - -#define CLONED_BINARY_ENV "_LIBCONTAINER_CLONED_BINARY" -#define RUNC_MEMFD_COMMENT "runc_cloned:/proc/self/exe" -/* - * There are newer memfd seals (such as F_SEAL_FUTURE_WRITE and F_SEAL_EXEC), - * which we use opportunistically. However, this set is the original set of - * memfd seals, and we require them all to be set to trust our /proc/self/exe - * if it is a memfd. - */ -#define RUNC_MEMFD_MIN_SEALS \ - (F_SEAL_SEAL | F_SEAL_SHRINK | F_SEAL_GROW | F_SEAL_WRITE) - -static void *must_realloc(void *ptr, size_t size) -{ - void *old = ptr; - do { - ptr = realloc(old, size); - } while (!ptr); - return ptr; -} - -/* - * Verify whether we are currently in a self-cloned program (namely, is - * /proc/self/exe a memfd). F_GET_SEALS will only succeed for memfds (or rather - * for shmem files), and we want to be sure it's actually sealed. - */ -static int is_self_cloned(void) -{ - int fd, seals = 0, is_cloned = false; - struct stat statbuf = { }; - struct statfs fsbuf = { }; - - fd = open("/proc/self/exe", O_RDONLY | O_CLOEXEC); - if (fd < 0) { - write_log(ERROR, "cannot open runc binary for reading: open /proc/self/exe: %m"); - return -ENOTRECOVERABLE; - } - - /* - * Is the binary a fully-sealed memfd? We don't need CLONED_BINARY_ENV for - * this, because you cannot write to a sealed memfd no matter what. - */ - seals = fcntl(fd, F_GET_SEALS); - if (seals >= 0) { - write_log(DEBUG, "checking /proc/self/exe memfd seals: 0x%x", seals); - is_cloned = (seals & RUNC_MEMFD_MIN_SEALS) == RUNC_MEMFD_MIN_SEALS; - if (is_cloned) - goto out; - } - - /* - * All other forms require CLONED_BINARY_ENV, since they are potentially - * writeable (or we can't tell if they're fully safe) and thus we must - * check the environment as an extra layer of defence. - */ - if (!getenv(CLONED_BINARY_ENV)) { - is_cloned = false; - goto out; - } - - /* - * Is the binary on a read-only filesystem? We can't detect bind-mounts in - * particular (in-kernel they are identical to regular mounts) but we can - * at least be sure that it's read-only. In addition, to make sure that - * it's *our* bind-mount we check CLONED_BINARY_ENV. - */ - if (fstatfs(fd, &fsbuf) >= 0) - is_cloned |= (fsbuf.f_flags & MS_RDONLY); - - /* - * Okay, we're a tmpfile -- or we're currently running on RHEL <=7.6 - * which appears to have a borked backport of F_GET_SEALS. Either way, - * having a file which has no hardlinks indicates that we aren't using - * a host-side "runc" binary and this is something that a container - * cannot fake (because unlinking requires being able to resolve the - * path that you want to unlink). - */ - if (fstat(fd, &statbuf) >= 0) - is_cloned |= (statbuf.st_nlink == 0); - -out: - close(fd); - return is_cloned; -} - -/* Read a given file into a new buffer, and providing the length. */ -static char *read_file(char *path, size_t *length) -{ - int fd; - char buf[4096], *copy = NULL; - - if (!length) - return NULL; - - fd = open(path, O_RDONLY | O_CLOEXEC); - if (fd < 0) - return NULL; - - *length = 0; - for (;;) { - ssize_t n; - - n = read(fd, buf, sizeof(buf)); - if (n < 0) - goto error; - if (!n) - break; - - copy = must_realloc(copy, (*length + n) * sizeof(*copy)); - memcpy(copy + *length, buf, n); - *length += n; - } - close(fd); - return copy; - -error: - close(fd); - free(copy); - return NULL; -} - -/* - * A poor-man's version of "xargs -0". Basically parses a given block of - * NUL-delimited data, within the given length and adds a pointer to each entry - * to the array of pointers. - */ -static int parse_xargs(char *data, int data_length, char ***output) -{ - int num = 0; - char *cur = data; - - if (!data || *output != NULL) - return -1; - - while (cur < data + data_length) { - num++; - *output = must_realloc(*output, (num + 1) * sizeof(**output)); - (*output)[num - 1] = cur; - cur += strlen(cur) + 1; - } - (*output)[num] = NULL; - return num; -} - -/* - * "Parse" out argv from /proc/self/cmdline. - * This is necessary because we are running in a context where we don't have a - * main() that we can just get the arguments from. - */ -static int fetchve(char ***argv) -{ - char *cmdline = NULL; - size_t cmdline_size; - - cmdline = read_file("/proc/self/cmdline", &cmdline_size); - if (!cmdline) - goto error; - - if (parse_xargs(cmdline, cmdline_size, argv) <= 0) - goto error; - - return 0; - -error: - free(cmdline); - return -EINVAL; -} - -enum { - EFD_NONE = 0, - EFD_MEMFD, - EFD_FILE, -}; - -/* - * This comes from . We can't hard-code __O_TMPFILE because it - * changes depending on the architecture. If we don't have O_TMPFILE we always - * have the mkostemp(3) fallback. - */ -#ifndef O_TMPFILE -# if defined(__O_TMPFILE) && defined(O_DIRECTORY) -# define O_TMPFILE (__O_TMPFILE | O_DIRECTORY) -# endif -#endif - -static inline bool is_memfd_unsupported_error(int err) -{ - /* - * - ENOSYS is obviously an "unsupported" error. - * - * - EINVAL could be hit if MFD_EXEC is not supported (pre-6.3 kernel), - * but it can also be hit if vm.memfd_noexec=2 (in kernels without - * [1] applied) and the flags does not contain MFD_EXEC. However, - * there was a bug in the original 6.3 implementation of - * vm.memfd_noexec=2, which meant that MFD_EXEC would work even in - * the "strict" mode. Because we try MFD_EXEC first, we won't get - * EINVAL in the vm.memfd_noexec=2 case (which means we don't need to - * figure out whether to log the message about memfd_create). - * - * - EACCES is returned in kernels that contain [1] in the - * vm.memfd_noexec=2 case. - * - * At time of writing, [1] is not in Linus's tree and it't not clear if - * it will be backported to stable, so what exact versions apply here - * is unclear. But the bug is present in 6.3-6.5 at the very least. - * - * [1]: https://lore.kernel.org/all/20230705063315.3680666-2-jeffxu@google.com/ - */ - if (err == EACCES) - write_log(INFO, - "memfd_create(MFD_EXEC) failed, possibly due to vm.memfd_noexec=2 -- falling back to less secure O_TMPFILE"); - return err == ENOSYS || err == EINVAL || err == EACCES; -} - -static int make_execfd(int *fdtype) -{ - int fd = -1; - char template[PATH_MAX] = { 0 }; - char *prefix = getenv("_LIBCONTAINER_STATEDIR"); - - if (!prefix || *prefix != '/') - prefix = "/tmp"; - if (snprintf(template, sizeof(template), "%s/runc.XXXXXX", prefix) < 0) - return -1; - - /* - * Now try memfd, it's much nicer than actually creating a file in STATEDIR - * since it's easily detected thanks to sealing and also doesn't require - * assumptions about STATEDIR. - */ - *fdtype = EFD_MEMFD; - /* - * On newer kernels we should set MFD_EXEC to indicate we need +x - * permissions. Otherwise an admin with vm.memfd_noexec=1 would subtly - * break runc. vm.memfd_noexec=2 is a little bit more complicated, see the - * comment in is_memfd_unsupported_error() -- the upshot is that doing it - * this way works, but only because of two overlapping bugs in the sysctl - * implementation. - */ - fd = memfd_create(RUNC_MEMFD_COMMENT, MFD_EXEC | MFD_CLOEXEC | MFD_ALLOW_SEALING); - if (fd < 0 && is_memfd_unsupported_error(errno)) - fd = memfd_create(RUNC_MEMFD_COMMENT, MFD_CLOEXEC | MFD_ALLOW_SEALING); - if (fd >= 0) - return fd; - if (!is_memfd_unsupported_error(errno)) - goto error; - -#ifdef O_TMPFILE - /* - * Try O_TMPFILE to avoid races where someone might snatch our file. Note - * that O_EXCL isn't actually a security measure here (since you can just - * fd re-open it and clear O_EXCL). - */ - *fdtype = EFD_FILE; - fd = open(prefix, O_TMPFILE | O_EXCL | O_RDWR | O_CLOEXEC, 0700); - if (fd >= 0) { - struct stat statbuf = { }; - bool working_otmpfile = false; - - /* - * open(2) ignores unknown O_* flags -- yeah, I was surprised when I - * found this out too. As a result we can't check for EINVAL. However, - * if we get nlink != 0 (or EISDIR) then we know that this kernel - * doesn't support O_TMPFILE. - */ - if (fstat(fd, &statbuf) >= 0) - working_otmpfile = (statbuf.st_nlink == 0); - - if (working_otmpfile) - return fd; - - /* Pretend that we got EISDIR since O_TMPFILE failed. */ - close(fd); - errno = EISDIR; - } - if (errno != EISDIR) - goto error; -#endif /* defined(O_TMPFILE) */ - - /* - * Our final option is to create a temporary file the old-school way, and - * then unlink it so that nothing else sees it by accident. - */ - *fdtype = EFD_FILE; - fd = mkostemp(template, O_CLOEXEC); - if (fd >= 0) { - if (unlink(template) >= 0) - return fd; - close(fd); - } - -error: - *fdtype = EFD_NONE; - return -1; -} - -static int seal_execfd(int *fd, int fdtype) -{ - switch (fdtype) { - case EFD_MEMFD:{ - /* - * Try to seal with newer seals, but we ignore errors because older - * kernels don't support some of them. For container security only - * RUNC_MEMFD_MIN_SEALS are strictly required, but the rest are - * nice-to-haves. We apply RUNC_MEMFD_MIN_SEALS at the end because it - * contains F_SEAL_SEAL. - */ - int __attribute__((unused)) _err1 = fcntl(*fd, F_ADD_SEALS, F_SEAL_FUTURE_WRITE); // Linux 5.1 - int __attribute__((unused)) _err2 = fcntl(*fd, F_ADD_SEALS, F_SEAL_EXEC); // Linux 6.3 - return fcntl(*fd, F_ADD_SEALS, RUNC_MEMFD_MIN_SEALS); - } - case EFD_FILE:{ - /* Need to re-open our pseudo-memfd as an O_PATH to avoid execve(2) giving -ETXTBSY. */ - int newfd; - char fdpath[PATH_MAX] = { 0 }; - - if (fchmod(*fd, 0100) < 0) - return -1; - - if (snprintf(fdpath, sizeof(fdpath), "/proc/self/fd/%d", *fd) < 0) - return -1; - - newfd = open(fdpath, O_PATH | O_CLOEXEC); - if (newfd < 0) - return -1; - - close(*fd); - *fd = newfd; - return 0; - } - default: - break; - } - return -1; -} - -static ssize_t fd_to_fd(int outfd, int infd) -{ - ssize_t total = 0; - char buffer[4096]; - - for (;;) { - ssize_t nread, nwritten = 0; - - nread = read(infd, buffer, sizeof(buffer)); - if (nread < 0) - return -1; - if (!nread) - break; - - do { - ssize_t n = write(outfd, buffer + nwritten, nread - nwritten); - if (n < 0) - return -1; - nwritten += n; - } while (nwritten < nread); - - total += nwritten; - } - - return total; -} - -static int clone_binary(void) -{ - int binfd, execfd; - struct stat statbuf = { }; - size_t sent = 0; - int fdtype = EFD_NONE; - - execfd = make_execfd(&fdtype); - if (execfd < 0 || fdtype == EFD_NONE) - return -ENOTRECOVERABLE; - - binfd = open("/proc/self/exe", O_RDONLY | O_CLOEXEC); - if (binfd < 0) - goto error; - - if (fstat(binfd, &statbuf) < 0) - goto error_binfd; - - while (sent < statbuf.st_size) { - int n = sendfile(execfd, binfd, NULL, statbuf.st_size - sent); - if (n < 0) { - /* sendfile can fail so we fallback to a dumb user-space copy. */ - n = fd_to_fd(execfd, binfd); - if (n < 0) - goto error_binfd; - } - sent += n; - } - close(binfd); - if (sent != statbuf.st_size) - goto error; - - if (seal_execfd(&execfd, fdtype) < 0) - goto error; - - return execfd; - -error_binfd: - close(binfd); -error: - close(execfd); - return -EIO; -} - -/* Get cheap access to the environment. */ -extern char **environ; - -int ensure_cloned_binary(void) -{ - int execfd; - char **argv = NULL; - - /* Check that we're not self-cloned, and if we are then bail. */ - int cloned = is_self_cloned(); - if (cloned > 0 || cloned == -ENOTRECOVERABLE) - return cloned; - - if (fetchve(&argv) < 0) - return -EINVAL; - - execfd = clone_binary(); - if (execfd < 0) - return -EIO; - - if (putenv(CLONED_BINARY_ENV "=1")) - goto error; - - fexecve(execfd, argv, environ); -error: - close(execfd); - return -ENOEXEC; -} diff --git a/libcontainer/nsenter/nsexec.c b/libcontainer/nsenter/nsexec.c index 22b6ea1cd21..45d5c291e4f 100644 --- a/libcontainer/nsenter/nsexec.c +++ b/libcontainer/nsenter/nsexec.c @@ -44,10 +44,6 @@ enum sync_t { SYNC_RECVPID_ACK = 0x43, /* PID was correctly received by parent. */ SYNC_GRANDCHILD = 0x44, /* The grandchild is ready to run. */ SYNC_CHILD_FINISH = 0x45, /* The child or grandchild has finished. */ - SYNC_MOUNTSOURCES_PLS = 0x46, /* Tell parent to send mount sources by SCM_RIGHTS. */ - SYNC_MOUNTSOURCES_ACK = 0x47, /* All mount sources have been sent. */ - SYNC_MOUNT_IDMAP_PLS = 0x48, /* Tell parent to mount idmap sources. */ - SYNC_MOUNT_IDMAP_ACK = 0x49, /* All idmap mounts have been done. */ }; #define STAGE_SETUP -1 @@ -96,14 +92,6 @@ struct nlconfig_t { size_t uidmappath_len; char *gidmappath; size_t gidmappath_len; - - /* Mount sources opened outside the container userns. */ - char *mountsources; - size_t mountsources_len; - - /* Idmap sources opened outside the container userns which will be id mapped. */ - char *idmapsources; - size_t idmapsources_len; }; /* @@ -120,8 +108,6 @@ struct nlconfig_t { #define ROOTLESS_EUID_ATTR 27287 #define UIDMAPPATH_ATTR 27288 #define GIDMAPPATH_ATTR 27289 -#define MOUNT_SOURCES_ATTR 27290 -#define IDMAP_SOURCES_ATTR 27291 /* * Use the raw syscall for versions of glibc which don't include a function for @@ -437,14 +423,6 @@ static void nl_parse(int fd, struct nlconfig_t *config) case SETGROUP_ATTR: config->is_setgroup = readint8(current); break; - case MOUNT_SOURCES_ATTR: - config->mountsources = current; - config->mountsources_len = payload_len; - break; - case IDMAP_SOURCES_ATTR: - config->idmapsources = current; - config->idmapsources_len = payload_len; - break; default: bail("unknown netlink message type %d", nlattr->nla_type); } @@ -525,9 +503,6 @@ void join_namespaces(char *nslist) free(namespaces); } -/* Defined in cloned_binary.c. */ -extern int ensure_cloned_binary(void); - static inline int sane_kill(pid_t pid, int signum) { if (pid > 0) @@ -536,115 +511,6 @@ static inline int sane_kill(pid_t pid, int signum) return 0; } -/* receive_fd_sources parses env_var as an array of fd numbers and, for each element that is - * not -1, it receives an fd via SCM_RIGHTS and dup3 it to the fd requested in - * the element of the env var. - */ -void receive_fd_sources(int sockfd, const char *env_var) -{ - char *fds, *endp; - long new_fd; - - // This env var must be a json array of ints. - fds = getenv(env_var); - - if (fds[0] != '[') { - bail("malformed %s env var: missing '['", env_var); - } - fds++; - - for (endp = fds; *endp != ']'; fds = endp + 1) { - new_fd = strtol(fds, &endp, 10); - if (endp == fds) { - bail("malformed %s env var: not a number", env_var); - } - if (*endp == '\0') { - bail("malformed %s env var: missing ]", env_var); - } - // The list contains -1 when no fd is needed. Ignore them. - if (new_fd == -1) { - continue; - } - - if (new_fd == LONG_MAX || new_fd < 0 || new_fd > INT_MAX) { - bail("malformed %s env var: fds out of range", env_var); - } - - int recv_fd = receive_fd(sockfd); - if (dup3(recv_fd, new_fd, O_CLOEXEC) < 0) { - bail("cannot dup3 fd %d to %ld", recv_fd, new_fd); - } - if (close(recv_fd) < 0) { - bail("cannot close fd %d", recv_fd); - } - } -} - -void receive_mountsources(int sockfd) -{ - receive_fd_sources(sockfd, "_LIBCONTAINER_MOUNT_FDS"); -} - -void send_mountsources(int sockfd, pid_t child, char *mountsources, size_t mountsources_len) -{ - char proc_path[PATH_MAX]; - int host_mntns_fd; - int container_mntns_fd; - int fd; - int ret; - - // container_linux.go shouldSendMountSources() decides if mount sources - // should be pre-opened (O_PATH) and passed via SCM_RIGHTS - if (mountsources == NULL) - return; - - host_mntns_fd = open("/proc/self/ns/mnt", O_RDONLY | O_CLOEXEC); - if (host_mntns_fd == -1) - bail("failed to get current mount namespace"); - - if (snprintf(proc_path, PATH_MAX, "/proc/%d/ns/mnt", child) < 0) - bail("failed to get mount namespace path"); - - container_mntns_fd = open(proc_path, O_RDONLY | O_CLOEXEC); - if (container_mntns_fd == -1) - bail("failed to get container mount namespace"); - - if (setns(container_mntns_fd, CLONE_NEWNS) < 0) - bail("failed to setns to container mntns"); - - char *mountsources_end = mountsources + mountsources_len; - while (mountsources < mountsources_end) { - if (mountsources[0] == '\0') { - mountsources++; - continue; - } - - fd = open(mountsources, O_PATH | O_CLOEXEC); - if (fd < 0) - bail("failed to open mount source %s", mountsources); - - write_log(DEBUG, "~> sending fd for: %s", mountsources); - if (send_fd(sockfd, fd) < 0) - bail("failed to send fd %d via unix socket %d", fd, sockfd); - - ret = close(fd); - if (ret != 0) - bail("failed to close mount source fd %d", fd); - - mountsources += strlen(mountsources) + 1; - } - - if (setns(host_mntns_fd, CLONE_NEWNS) < 0) - bail("failed to setns to host mntns"); - - ret = close(host_mntns_fd); - if (ret != 0) - bail("failed to close host mount namespace fd %d", host_mntns_fd); - ret = close(container_mntns_fd); - if (ret != 0) - bail("failed to close container mount namespace fd %d", container_mntns_fd); -} - void try_unshare(int flags, const char *msg) { write_log(DEBUG, "unshare %s", msg); @@ -664,89 +530,6 @@ void try_unshare(int flags, const char *msg) bail("failed to unshare %s", msg); } -void send_idmapsources(int sockfd, pid_t pid, char *idmap_src, int idmap_src_len) -{ - char proc_user_path[PATH_MAX]; - - /* Open the userns fd only once. - * Currently we only support idmap mounts that use the same mapping than - * the userns. This is validated in libcontainer/configs/validate/validator.go, - * so if we reached here, we know the mapping for the idmap is the same - * as the userns. This is why we just open the userns_fd once from the - * PID of the child process that has the userns already applied. - */ - int ret = snprintf(proc_user_path, sizeof(proc_user_path), "/proc/%d/ns/user", pid); - if (ret < 0 || (size_t)ret >= sizeof(proc_user_path)) { - sane_kill(pid, SIGKILL); - bail("failed to create userns path string"); - } - - int userns_fd = open(proc_user_path, O_RDONLY | O_CLOEXEC | O_NOCTTY); - if (userns_fd < 0) { - sane_kill(pid, SIGKILL); - bail("failed to get user namespace fd"); - } - - char *idmap_end = idmap_src + idmap_src_len; - while (idmap_src < idmap_end) { - if (idmap_src[0] == '\0') { - idmap_src++; - continue; - } - - int fd_tree = sys_open_tree(-EBADF, idmap_src, - OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC | - AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT); - if (fd_tree < 0) { - sane_kill(pid, SIGKILL); - if (errno == ENOSYS) { - bail("open_tree(2) failed, the kernel doesn't support ID-mapped mounts"); - } else if (errno == EINVAL) { - bail("open_tree(2) failed with path: %s, the kernel doesn't support ID-mapped mounts", - idmap_src); - } else { - bail("open_tree(2) failed with path: %s", idmap_src); - } - } - - struct mount_attr attr = { - .attr_set = MOUNT_ATTR_IDMAP, - .userns_fd = userns_fd, - }; - - ret = sys_mount_setattr(fd_tree, "", AT_EMPTY_PATH, &attr, sizeof(attr)); - if (ret < 0) { - sane_kill(pid, SIGKILL); - if (errno == ENOSYS) - bail("mount_setattr(2) failed, the kernel doesn't support ID-mapped mounts"); - else if (errno == EINVAL) - bail("mount_setattr(2) failed with path: %s, maybe the filesystem doesn't support ID-mapped mounts", idmap_src); - else - bail("mount_setattr(2) failed with path: %s", idmap_src); - } - - write_log(DEBUG, "~> sending idmap source: %s with mapping from: %s", idmap_src, proc_user_path); - send_fd(sockfd, fd_tree); - - if (close(fd_tree) < 0) { - sane_kill(pid, SIGKILL); - bail("error closing fd_tree"); - } - - idmap_src += strlen(idmap_src) + 1; - } - - if (close(userns_fd) < 0) { - sane_kill(pid, SIGKILL); - bail("error closing userns fd"); - } -} - -void receive_idmapsources(int sockfd) -{ - receive_fd_sources(sockfd, "_LIBCONTAINER_IDMAP_FDS"); -} - void nsexec(void) { int pipenum; @@ -771,14 +554,6 @@ void nsexec(void) return; } - /* - * We need to re-exec if we are not in a cloned binary. This is necessary - * to ensure that containers won't be able to access the host binary - * through /proc/self/exe. See CVE-2019-5736. - */ - if (ensure_cloned_binary() < 0) - bail("could not ensure we are a cloned binary"); - /* * Inform the parent we're past initial setup. * For the other side of this, see initWaiter. @@ -977,28 +752,6 @@ void nsexec(void) sane_kill(stage2_pid, SIGKILL); bail("failed to sync with runc: write(pid-JSON)"); } - break; - case SYNC_MOUNTSOURCES_PLS: - write_log(DEBUG, "stage-1 requested to open mount sources"); - send_mountsources(syncfd, stage1_pid, config.mountsources, - config.mountsources_len); - - s = SYNC_MOUNTSOURCES_ACK; - if (write(syncfd, &s, sizeof(s)) != sizeof(s)) { - sane_kill(stage1_pid, SIGKILL); - bail("failed to sync with child: write(SYNC_MOUNTSOURCES_ACK)"); - } - break; - case SYNC_MOUNT_IDMAP_PLS: - write_log(DEBUG, "stage-1 requested to open idmap sources"); - send_idmapsources(syncfd, stage1_pid, config.idmapsources, - config.idmapsources_len); - s = SYNC_MOUNT_IDMAP_ACK; - if (write(syncfd, &s, sizeof(s)) != sizeof(s)) { - sane_kill(stage1_pid, SIGKILL); - bail("failed to sync with child: write(SYNC_MOUNT_IDMAP_ACK)"); - } - break; case SYNC_CHILD_FINISH: write_log(DEBUG, "stage-1 complete"); @@ -1151,39 +904,7 @@ void nsexec(void) * some old kernel versions where clone(CLONE_PARENT | CLONE_NEWPID) * was broken, so we'll just do it the long way anyway. */ - try_unshare(config.cloneflags & ~CLONE_NEWCGROUP, "remaining namespaces (except cgroupns)"); - - /* Ask our parent to send the mount sources fds. */ - if (config.mountsources) { - write_log(DEBUG, "request stage-0 to send mount sources"); - s = SYNC_MOUNTSOURCES_PLS; - if (write(syncfd, &s, sizeof(s)) != sizeof(s)) - bail("failed to sync with parent: write(SYNC_MOUNTSOURCES_PLS)"); - - /* Receive and install all mount sources fds. */ - receive_mountsources(syncfd); - - /* Parent finished to send the mount sources fds. */ - if (read(syncfd, &s, sizeof(s)) != sizeof(s)) - bail("failed to sync with parent: read(SYNC_MOUNTSOURCES_ACK)"); - if (s != SYNC_MOUNTSOURCES_ACK) - bail("failed to sync with parent: SYNC_MOUNTSOURCES_ACK: got %u", s); - } - - if (config.idmapsources) { - write_log(DEBUG, "request stage-0 to send idmap sources"); - s = SYNC_MOUNT_IDMAP_PLS; - if (write(syncfd, &s, sizeof(s)) != sizeof(s)) - bail("failed to sync with parent: write(SYNC_MOUNT_IDMAP_PLS)"); - - /* Receive and install all idmap fds. */ - receive_idmapsources(syncfd); - - if (read(syncfd, &s, sizeof(s)) != sizeof(s)) - bail("failed to sync with parent: read(SYNC_MOUNT_IDMAP_ACK)"); - if (s != SYNC_MOUNT_IDMAP_ACK) - bail("failed to sync with parent: SYNC_MOUNT_IDMAP_ACK: got %u", s); - } + try_unshare(config.cloneflags, "remaining namespaces"); /* * TODO: What about non-namespace clone flags that we're dropping here? @@ -1281,10 +1002,6 @@ void nsexec(void) bail("setgroups failed"); } - if (config.cloneflags & CLONE_NEWCGROUP) { - try_unshare(CLONE_NEWCGROUP, "cgroup namespace"); - } - write_log(DEBUG, "signal completion to stage-0"); s = SYNC_CHILD_FINISH; if (write(syncfd, &s, sizeof(s)) != sizeof(s)) diff --git a/libcontainer/process_linux.go b/libcontainer/process_linux.go index 48861406dba..fb5cdc432b6 100644 --- a/libcontainer/process_linux.go +++ b/libcontainer/process_linux.go @@ -1,6 +1,7 @@ package libcontainer import ( + "context" "encoding/json" "errors" "fmt" @@ -9,19 +10,22 @@ import ( "os" "os/exec" "path/filepath" + "runtime" "strconv" "time" + "github.com/opencontainers/runtime-spec/specs-go" + "github.com/sirupsen/logrus" + "golang.org/x/sys/unix" + "github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/cgroups/fs2" "github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runc/libcontainer/intelrdt" "github.com/opencontainers/runc/libcontainer/logs" "github.com/opencontainers/runc/libcontainer/system" + "github.com/opencontainers/runc/libcontainer/userns" "github.com/opencontainers/runc/libcontainer/utils" - "github.com/opencontainers/runtime-spec/specs-go" - "github.com/sirupsen/logrus" - "golang.org/x/sys/unix" ) type parentProcess interface { @@ -167,18 +171,34 @@ func (p *setnsProcess) start() (retErr error) { // This shouldn't happen. panic("unexpected procReady in setns") case procHooks: - // This shouldn't happen. + // this shouldn't happen. panic("unexpected procHooks in setns") + case procMountPlease: + // this shouldn't happen. + panic("unexpected procMountPlease in setns") case procSeccomp: if p.config.Config.Seccomp.ListenerPath == "" { - return errors.New("listenerPath is not set") + return errors.New("seccomp listenerPath is not set") } - - seccompFd, err := recvSeccompFd(uintptr(p.pid()), uintptr(sync.Fd)) + if sync.Arg == nil { + return fmt.Errorf("sync %q is missing an argument", sync.Type) + } + var srcFd int + if err := json.Unmarshal(*sync.Arg, &srcFd); err != nil { + return fmt.Errorf("sync %q passed invalid fd arg: %w", sync.Type, err) + } + seccompFd, err := pidGetFd(p.pid(), srcFd) if err != nil { + return fmt.Errorf("sync %q get fd %d from child failed: %w", sync.Type, srcFd, err) + } + defer seccompFd.Close() + // We have a copy, the child can keep working. We don't need to + // wait for the seccomp notify listener to get the fd before we + // permit the child to continue because the child will happily wait + // for the listener if it hits SCMP_ACT_NOTIFY. + if err := writeSync(p.messageSockPair.parent, procSeccompDone); err != nil { return err } - defer unix.Close(seccompFd) bundle, annotations := utils.Annotations(p.config.Config.Labels) containerProcessState := &specs.ContainerProcessState{ @@ -199,15 +219,10 @@ func (p *setnsProcess) start() (retErr error) { containerProcessState, seccompFd); err != nil { return err } - - // Sync with child. - if err := writeSync(p.messageSockPair.parent, procSeccompDone); err != nil { - return err - } - return nil default: return errors.New("invalid JSON payload from child") } + return nil }) if err := unix.Shutdown(int(p.messageSockPair.parent.Fd()), unix.SHUT_WR); err != nil { @@ -352,6 +367,95 @@ func (p *initProcess) waitForChildExit(childPid int) error { return nil } +type mountSourceRequestFn func(*configs.Mount) (mountSource, error) + +// goCreateMountSources spawns a goroutine which creates open_tree(2)-style +// mountfds based on the requested configs.Mount configuration. The returned +// requestFn and cancelFn are used to interact with the goroutine. +func (p *initProcess) goCreateMountSources(ctx context.Context) (mountSourceRequestFn, context.CancelFunc, error) { + type response struct { + src mountSource + err error + } + + errCh := make(chan error, 1) + requestCh := make(chan *configs.Mount) + responseCh := make(chan response) + + ctx, cancelFn := context.WithCancel(ctx) + go func() { + // We lock this thread because we need to setns(2) here. There is no + // UnlockOSThread() here, to ensure that the Go runtime will kill this + // thread once this goroutine returns (ensuring no other goroutines run + // in this context). + runtime.LockOSThread() + + // Detach from the shared fs of the rest of the Go process. + if err := unix.Unshare(unix.CLONE_FS); err != nil { + err = os.NewSyscallError("unshare(CLONE_FS)", err) + errCh <- fmt.Errorf("mount source thread: %w", err) + } + + // Attach to the container's mount namespace. + nsFd, err := os.Open(fmt.Sprintf("/proc/%d/ns/mnt", p.pid())) + if err != nil { + errCh <- fmt.Errorf("mount source thread: open container mntns: %w", err) + return + } + defer nsFd.Close() + if err := unix.Setns(int(nsFd.Fd()), unix.CLONE_NEWNS); err != nil { + err = os.NewSyscallError("setns", err) + errCh <- fmt.Errorf("mount source thread: join container mntns: %w", err) + return + } + + // No errors during setup! + errCh <- nil + logrus.Debugf("mount source thread: successfully running in container mntns") + + nsHandles := new(userns.Handles) + defer nsHandles.Release() + for { + select { + case m := <-requestCh: + src, err := mountFd(nsHandles, m) + logrus.Debugf("mount source thread: handling request for %q: %v %v", m.Source, src, err) + responseCh <- response{ + src: src, + err: err, + } + case <-ctx.Done(): + close(requestCh) + close(responseCh) + return + } + } + }() + + // Check for setup errors. + err := <-errCh + close(errCh) + if err != nil { + cancelFn() + return nil, nil, err + } + + requestFn := func(m *configs.Mount) (mountSource, error) { + select { + case requestCh <- m: + select { + case resp := <-responseCh: + return resp.src, resp.err + case <-ctx.Done(): + return mountSource{}, errors.New("receive mount source failed: channel closed") + } + case <-ctx.Done(): + return mountSource{}, errors.New("send mount source request failed: channel closed") + } + } + return requestFn, cancelFn, nil +} + func (p *initProcess) start() (retErr error) { defer p.messageSockPair.parent.Close() //nolint: errcheck err := p.cmd.Start() @@ -442,6 +546,17 @@ func (p *initProcess) start() (retErr error) { return fmt.Errorf("error waiting for our first child to exit: %w", err) } + // Spin up a goroutine to handle remapping mount requests by runc init. There is no point doing this for rootless containers becuase + var mountRequest mountSourceRequestFn + if !p.container.config.RootlessEUID { + request, cancel, err := p.goCreateMountSources(context.Background()) + if err != nil { + return fmt.Errorf("error spawning mount remapping thread: %w", err) + } + defer cancel() + mountRequest = request + } + if err := p.createNetworkInterfaces(); err != nil { return fmt.Errorf("error creating network interfaces: %w", err) } @@ -458,16 +573,58 @@ func (p *initProcess) start() (retErr error) { ierr := parseSync(p.messageSockPair.parent, func(sync *syncT) error { switch sync.Type { + case procMountPlease: + if mountRequest == nil { + return fmt.Errorf("cannot fulfil mount requests as a rootless user") + } + var m *configs.Mount + if sync.Arg == nil { + return fmt.Errorf("sync %q is missing an argument", sync.Type) + } + if err := json.Unmarshal(*sync.Arg, &m); err != nil { + return fmt.Errorf("sync %q passed invalid mount arg: %w", sync.Type, err) + } + mnt, err := mountRequest(m) + if err != nil { + return fmt.Errorf("failed to fulfil mount request: %w", err) + } + defer mnt.file.Close() + + arg, err := json.Marshal(mnt) + if err != nil { + return fmt.Errorf("sync %q failed to marshal mountSource: %w", sync.Type, err) + } + argMsg := json.RawMessage(arg) + if err := doWriteSync(p.messageSockPair.parent, syncT{ + Type: procMountFd, + Arg: &argMsg, + File: mnt.file, + }); err != nil { + return err + } case procSeccomp: if p.config.Config.Seccomp.ListenerPath == "" { - return errors.New("listenerPath is not set") + return errors.New("seccomp listenerPath is not set") } - - seccompFd, err := recvSeccompFd(uintptr(childPid), uintptr(sync.Fd)) + var srcFd int + if sync.Arg == nil { + return fmt.Errorf("sync %q is missing an argument", sync.Type) + } + if err := json.Unmarshal(*sync.Arg, &srcFd); err != nil { + return fmt.Errorf("sync %q passed invalid fd arg: %w", sync.Type, err) + } + seccompFd, err := pidGetFd(p.pid(), srcFd) if err != nil { + return fmt.Errorf("sync %q get fd %d from child failed: %w", sync.Type, srcFd, err) + } + defer seccompFd.Close() + // We have a copy, the child can keep working. We don't need to + // wait for the seccomp notify listener to get the fd before we + // permit the child to continue because the child will happily wait + // for the listener if it hits SCMP_ACT_NOTIFY. + if err := writeSync(p.messageSockPair.parent, procSeccompDone); err != nil { return err } - defer unix.Close(seccompFd) s, err := p.container.currentOCIState() if err != nil { @@ -488,11 +645,6 @@ func (p *initProcess) start() (retErr error) { containerProcessState, seccompFd); err != nil { return err } - - // Sync with child. - if err := writeSync(p.messageSockPair.parent, procSeccompDone); err != nil { - return err - } case procReady: // set rlimits, this has to be done here because we lose permissions // to raise the limits once we enter a user-namespace @@ -591,7 +743,6 @@ func (p *initProcess) start() (retErr error) { default: return errors.New("invalid JSON payload from child") } - return nil }) @@ -684,22 +835,20 @@ func (p *initProcess) forwardChildLogs() chan error { return logs.ForwardLogs(p.logFilePair.parent) } -func recvSeccompFd(childPid, childFd uintptr) (int, error) { - pidfd, _, errno := unix.Syscall(unix.SYS_PIDFD_OPEN, childPid, 0, 0) - if errno != 0 { - return -1, fmt.Errorf("performing SYS_PIDFD_OPEN syscall: %w", errno) +func pidGetFd(pid, srcFd int) (*os.File, error) { + pidFd, err := unix.PidfdOpen(pid, 0) + if err != nil { + return nil, os.NewSyscallError("pidfd_open", err) } - defer unix.Close(int(pidfd)) - - seccompFd, _, errno := unix.Syscall(unix.SYS_PIDFD_GETFD, pidfd, childFd, 0) - if errno != 0 { - return -1, fmt.Errorf("performing SYS_PIDFD_GETFD syscall: %w", errno) + defer unix.Close(pidFd) + fd, err := unix.PidfdGetfd(pidFd, srcFd, 0) + if err != nil { + return nil, os.NewSyscallError("pidfd_getfd", err) } - - return int(seccompFd), nil + return os.NewFile(uintptr(fd), "[pidfd_getfd]"), nil } -func sendContainerProcessState(listenerPath string, state *specs.ContainerProcessState, fd int) error { +func sendContainerProcessState(listenerPath string, state *specs.ContainerProcessState, file *os.File) error { conn, err := net.Dial("unix", listenerPath) if err != nil { return fmt.Errorf("failed to connect with seccomp agent specified in the seccomp profile: %w", err) @@ -716,11 +865,10 @@ func sendContainerProcessState(listenerPath string, state *specs.ContainerProces return fmt.Errorf("cannot marshall seccomp state: %w", err) } - err = utils.SendFds(socket, b, fd) - if err != nil { + if err := utils.SendRawFd(socket, string(b), file.Fd()); err != nil { return fmt.Errorf("cannot send seccomp fd to %s: %w", listenerPath, err) } - + runtime.KeepAlive(file) return nil } diff --git a/libcontainer/rootfs_linux.go b/libcontainer/rootfs_linux.go index ac3ba183e12..fb7e917832c 100644 --- a/libcontainer/rootfs_linux.go +++ b/libcontainer/rootfs_linux.go @@ -1,9 +1,9 @@ package libcontainer import ( + "encoding/json" "errors" "fmt" - "io" "os" "path" "path/filepath" @@ -14,16 +14,17 @@ import ( securejoin "github.com/cyphar/filepath-securejoin" "github.com/moby/sys/mountinfo" "github.com/mrunalp/fileutils" + "github.com/opencontainers/runtime-spec/specs-go" + "github.com/opencontainers/selinux/go-selinux/label" + "github.com/sirupsen/logrus" + "golang.org/x/sys/unix" + "github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/cgroups/fs2" "github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runc/libcontainer/devices" "github.com/opencontainers/runc/libcontainer/userns" "github.com/opencontainers/runc/libcontainer/utils" - "github.com/opencontainers/runtime-spec/specs-go" - "github.com/opencontainers/selinux/go-selinux/label" - "github.com/sirupsen/logrus" - "golang.org/x/sys/unix" ) const defaultMountFlags = unix.MS_NOEXEC | unix.MS_NOSUID | unix.MS_NODEV @@ -41,12 +42,12 @@ type mountConfig struct { // mountEntry contains mount data specific to a mount point. type mountEntry struct { *configs.Mount - srcFD *int + srcFile *mountSource } func (m *mountEntry) src() string { - if m.srcFD != nil { - return "/proc/self/fd/" + strconv.Itoa(*m.srcFD) + if m.srcFile != nil { + return "/proc/self/fd/" + strconv.Itoa(int(m.srcFile.file.Fd())) } return m.Source } @@ -64,20 +65,12 @@ func needsSetupDev(config *configs.Config) bool { // prepareRootfs sets up the devices, mount points, and filesystems for use // inside a new mount namespace. It doesn't set anything as ro. You must call // finalizeRootfs after this function to finish setting up the rootfs. -func prepareRootfs(pipe io.ReadWriter, iConfig *initConfig, mountFds mountFds) (err error) { +func prepareRootfs(pipe *os.File, iConfig *initConfig) (err error) { config := iConfig.Config if err := prepareRoot(config); err != nil { return fmt.Errorf("error preparing rootfs: %w", err) } - if mountFds.sourceFds != nil && len(mountFds.sourceFds) != len(config.Mounts) { - return fmt.Errorf("malformed mountFds slice. Expected size: %v, got: %v", len(config.Mounts), len(mountFds.sourceFds)) - } - - if mountFds.idmapFds != nil && len(mountFds.idmapFds) != len(config.Mounts) { - return fmt.Errorf("malformed idmapFds slice: expected size: %v, got: %v", len(config.Mounts), len(mountFds.idmapFds)) - } - mountConfig := &mountConfig{ root: config.Rootfs, label: config.MountLabel, @@ -86,22 +79,48 @@ func prepareRootfs(pipe io.ReadWriter, iConfig *initConfig, mountFds mountFds) ( cgroupns: config.Namespaces.Contains(configs.NEWCGROUP), noMountFallback: config.NoMountFallback, } - for i, m := range config.Mounts { + for _, m := range config.Mounts { entry := mountEntry{Mount: m} - // Just before the loop we checked that if not empty, len(mountFds.sourceFds) == len(config.Mounts). - // Therefore, we can access mountFds.sourceFds[i] without any concerns. - if mountFds.sourceFds != nil && mountFds.sourceFds[i] != -1 { - entry.srcFD = &mountFds.sourceFds[i] + // Figure out whether we need to request runc to give us an + // open_tree(2)-style mountfd. For idmapped mounts, this is always + // necessary. For bind-mounts, this is only necessary if we cannot + // resolve the parent mount (this is only hit if you are running in a + // userns). + wantSourceFile := m.IsIDMapped() + if m.IsBind() { + if _, err := os.Stat(m.Source); err != nil { + wantSourceFile = true + } } - - // We validated before we can access mountFds.idmapFds[i]. - if mountFds.idmapFds != nil && mountFds.idmapFds[i] != -1 { - if entry.srcFD != nil { - return fmt.Errorf("malformed mountFds and idmapFds slice, entry: %v has fds in both slices", i) + if wantSourceFile { + // Request a source file from the host. + if err := writeSyncArg(pipe, procMountPlease, m); err != nil { + return fmt.Errorf("failed to request mountfd for %q: %w", m.Source, err) + } + sync, err := readSyncFull(pipe, procMountFd) + if err != nil { + return fmt.Errorf("mountfd request for %q failed: %w", m.Source, err) + } + if sync.File == nil { + return fmt.Errorf("mountfd request for %q: response missing attached fd", m.Source) + } + defer sync.File.Close() + // Sanity-check to make sure we didn't get the wrong fd back. This + // should never happen. + if sync.File.Name() != m.Source { + return fmt.Errorf("returned mountfd for %q doesn't match requested mount configuration: mountfd path is %q", m.Source, sync.File.Name()) + } + // Unmarshal the procMountFd argument (the file is sync.File). + var src mountSource + if sync.Arg == nil { + return fmt.Errorf("sync %q is missing an argument", sync.Type) + } + if err := json.Unmarshal(*sync.Arg, &src); err != nil { + return fmt.Errorf("invalid mount fd response argument %q: %w", string(*sync.Arg), err) } - entry.srcFD = &mountFds.idmapFds[i] + src.file = sync.File + entry.srcFile = &src } - if err := mountToRootfs(mountConfig, entry); err != nil { return fmt.Errorf("error mounting %q to rootfs at %q: %w", m.Source, m.Destination, err) } @@ -284,7 +303,7 @@ func mountCgroupV1(m *configs.Mount, c *mountConfig) error { if err := os.MkdirAll(subsystemPath, 0o755); err != nil { return err } - if err := utils.WithProcfd(c.root, b.Destination, func(dstFD string) error { + if err := utils.WithProcfd(c.root, b.Destination, func(dstFd string) error { flags := defaultMountFlags if m.Flags&unix.MS_RDONLY != 0 { flags = flags | unix.MS_RDONLY @@ -297,7 +316,7 @@ func mountCgroupV1(m *configs.Mount, c *mountConfig) error { data = cgroups.CgroupNamePrefix + data source = "systemd" } - return mountViaFDs(source, nil, b.Destination, dstFD, "cgroup", uintptr(flags), data) + return mountViaFds(source, nil, b.Destination, dstFd, "cgroup", uintptr(flags), data) }); err != nil { return err } @@ -328,8 +347,8 @@ func mountCgroupV2(m *configs.Mount, c *mountConfig) error { if err := os.MkdirAll(dest, 0o755); err != nil { return err } - err = utils.WithProcfd(c.root, m.Destination, func(dstFD string) error { - return mountViaFDs(m.Source, nil, m.Destination, dstFD, "cgroup2", uintptr(m.Flags), m.Data) + err = utils.WithProcfd(c.root, m.Destination, func(dstFd string) error { + return mountViaFds(m.Source, nil, m.Destination, dstFd, "cgroup2", uintptr(m.Flags), m.Data) }) if err == nil || !(errors.Is(err, unix.EPERM) || errors.Is(err, unix.EBUSY)) { return err @@ -350,7 +369,6 @@ func mountCgroupV2(m *configs.Mount, c *mountConfig) error { bindM.Source = c.cgroup2Path } // mountToRootfs() handles remounting for MS_RDONLY. - // No need to set mountEntry.srcFD here, because mountToRootfs() calls utils.WithProcfd() by itself in mountPropagate(). err = mountToRootfs(c, mountEntry{Mount: bindM}) if c.rootlessCgroups && errors.Is(err, unix.ENOENT) { // ENOENT (for `src = c.cgroup2Path`) happens when rootless runc is being executed @@ -395,15 +413,15 @@ func doTmpfsCopyUp(m mountEntry, rootfs, mountLabel string) (Err error) { } }() - return utils.WithProcfd(rootfs, m.Destination, func(dstFD string) (Err error) { + return utils.WithProcfd(rootfs, m.Destination, func(dstFd string) (Err error) { // Copy the container data to the host tmpdir. We append "/" to force // CopyDirectory to resolve the symlink rather than trying to copy the // symlink itself. - if err := fileutils.CopyDirectory(dstFD+"/", tmpDir); err != nil { - return fmt.Errorf("tmpcopyup: failed to copy %s to %s (%s): %w", m.Destination, dstFD, tmpDir, err) + if err := fileutils.CopyDirectory(dstFd+"/", tmpDir); err != nil { + return fmt.Errorf("tmpcopyup: failed to copy %s to %s (%s): %w", m.Destination, dstFd, tmpDir, err) } // Now move the mount into the container. - if err := mountViaFDs(tmpDir, nil, m.Destination, dstFD, "", unix.MS_MOVE, ""); err != nil { + if err := mountViaFds(tmpDir, nil, m.Destination, dstFd, "", unix.MS_MOVE, ""); err != nil { return fmt.Errorf("tmpcopyup: failed to move mount: %w", err) } return nil @@ -480,35 +498,9 @@ func mountToRootfs(c *mountConfig, m mountEntry) error { if err := prepareBindMount(m, rootfs); err != nil { return err } - - if m.IsBind() && m.IsIDMapped() { - if m.srcFD == nil { - return fmt.Errorf("error creating mount %+v: idmapFD is invalid, should point to a valid fd", m) - } - if err := unix.MoveMount(*m.srcFD, "", -1, dest, unix.MOVE_MOUNT_F_EMPTY_PATH); err != nil { - return fmt.Errorf("error on unix.MoveMount %+v: %w", m, err) - } - - // In nsexec.c, we did not set the propagation field of mount_attr struct. - // So, let's deal with these flags right now! - if err := utils.WithProcfd(rootfs, dest, func(dstFD string) error { - for _, pflag := range m.PropagationFlags { - // When using mount for setting propagations flags, the source, file - // system type and data arguments are ignored: - // https://man7.org/linux/man-pages/man2/mount.2.html - // We also ignore procfd because we want to act on dest. - if err := mountViaFDs("", nil, dest, dstFD, "", uintptr(pflag), ""); err != nil { - return err - } - } - return nil - }); err != nil { - return fmt.Errorf("change mount propagation through procfd: %w", err) - } - } else { - if err := mountPropagate(m, rootfs, mountLabel); err != nil { - return err - } + // open_tree()-related shenanigans are all handled in mountViaFds. + if err := mountPropagate(m, rootfs, mountLabel); err != nil { + return err } // bind mount won't change mount options, we need remount to make mount options effective. // first check that we have non-default options required before attempting a remount @@ -732,8 +724,8 @@ func bindMountDeviceNode(rootfs, dest string, node *devices.Device) error { if f != nil { _ = f.Close() } - return utils.WithProcfd(rootfs, dest, func(dstFD string) error { - return mountViaFDs(node.Path, nil, dest, dstFD, "bind", unix.MS_BIND, "") + return utils.WithProcfd(rootfs, dest, func(dstFd string) error { + return mountViaFds(node.Path, nil, dest, dstFd, "bind", unix.MS_BIND, "") }) } @@ -1104,9 +1096,9 @@ func writeSystemProperty(key, value string) error { } func remount(m mountEntry, rootfs string, noMountFallback bool) error { - return utils.WithProcfd(rootfs, m.Destination, func(dstFD string) error { + return utils.WithProcfd(rootfs, m.Destination, func(dstFd string) error { flags := uintptr(m.Flags | unix.MS_REMOUNT) - err := mountViaFDs(m.Source, m.srcFD, m.Destination, dstFD, m.Device, flags, "") + err := mountViaFds("", nil, m.Destination, dstFd, m.Device, flags, "") if err == nil { return nil } @@ -1130,7 +1122,7 @@ func remount(m mountEntry, rootfs string, noMountFallback bool) error { } // ... and retry the mount with flags found above. flags |= uintptr(int(s.Flags) & checkflags) - return mountViaFDs(m.Source, m.srcFD, m.Destination, dstFD, m.Device, flags, "") + return mountViaFds("", nil, m.Destination, dstFd, m.Device, flags, "") }) } @@ -1153,17 +1145,17 @@ func mountPropagate(m mountEntry, rootfs string, mountLabel string) error { // mutating underneath us, we verify that we are actually going to mount // inside the container with WithProcfd() -- mounting through a procfd // mounts on the target. - if err := utils.WithProcfd(rootfs, m.Destination, func(dstFD string) error { - return mountViaFDs(m.Source, m.srcFD, m.Destination, dstFD, m.Device, uintptr(flags), data) + if err := utils.WithProcfd(rootfs, m.Destination, func(dstFd string) error { + return mountViaFds(m.Source, m.srcFile, m.Destination, dstFd, m.Device, uintptr(flags), data) }); err != nil { return err } // We have to apply mount propagation flags in a separate WithProcfd() call // because the previous call invalidates the passed procfd -- the mount // target needs to be re-opened. - if err := utils.WithProcfd(rootfs, m.Destination, func(dstFD string) error { + if err := utils.WithProcfd(rootfs, m.Destination, func(dstFd string) error { for _, pflag := range m.PropagationFlags { - if err := mountViaFDs("", nil, m.Destination, dstFD, "", uintptr(pflag), ""); err != nil { + if err := mountViaFds("", nil, m.Destination, dstFd, "", uintptr(pflag), ""); err != nil { return err } } diff --git a/libcontainer/seccomp/patchbpf/enosys_linux.go b/libcontainer/seccomp/patchbpf/enosys_linux.go index 7fc9fd662c3..40e51533410 100644 --- a/libcontainer/seccomp/patchbpf/enosys_linux.go +++ b/libcontainer/seccomp/patchbpf/enosys_linux.go @@ -690,17 +690,17 @@ func sysSeccompSetFilter(flags uint, filter []unix.SockFilter) (fd int, err erro // patches said filter to handle -ENOSYS in a much nicer manner than the // default libseccomp default action behaviour, and loads the patched filter // into the kernel for the current process. -func PatchAndLoad(config *configs.Seccomp, filter *libseccomp.ScmpFilter) (int, error) { +func PatchAndLoad(config *configs.Seccomp, filter *libseccomp.ScmpFilter) (*os.File, error) { // Generate a patched filter. fprog, err := enosysPatchFilter(config, filter) if err != nil { - return -1, fmt.Errorf("error patching filter: %w", err) + return nil, fmt.Errorf("error patching filter: %w", err) } // Get the set of libseccomp flags set. seccompFlags, noNewPrivs, err := filterFlags(config, filter) if err != nil { - return -1, fmt.Errorf("unable to fetch seccomp filter flags: %w", err) + return nil, fmt.Errorf("unable to fetch seccomp filter flags: %w", err) } // Set no_new_privs if it was requested, though in runc we handle @@ -708,15 +708,14 @@ func PatchAndLoad(config *configs.Seccomp, filter *libseccomp.ScmpFilter) (int, if noNewPrivs { logrus.Warnf("potentially misconfigured filter -- setting no_new_privs in seccomp path") if err := unix.Prctl(unix.PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil { - return -1, fmt.Errorf("error enabling no_new_privs bit: %w", err) + return nil, fmt.Errorf("error enabling no_new_privs bit: %w", err) } } // Finally, load the filter. fd, err := sysSeccompSetFilter(seccompFlags, fprog) if err != nil { - return -1, fmt.Errorf("error loading seccomp filter: %w", err) + return nil, fmt.Errorf("error loading seccomp filter: %w", err) } - - return fd, nil + return os.NewFile(uintptr(fd), "[seccomp filter]"), nil } diff --git a/libcontainer/seccomp/seccomp_linux.go b/libcontainer/seccomp/seccomp_linux.go index fed02bcedc4..2c64ebbbadd 100644 --- a/libcontainer/seccomp/seccomp_linux.go +++ b/libcontainer/seccomp/seccomp_linux.go @@ -6,6 +6,7 @@ package seccomp import ( "errors" "fmt" + "os" libseccomp "github.com/seccomp/libseccomp-golang" "github.com/sirupsen/logrus" @@ -27,17 +28,16 @@ const ( ) // InitSeccomp installs the seccomp filters to be used in the container as -// specified in config. -// Returns the seccomp file descriptor if any of the filters include a -// SCMP_ACT_NOTIFY action, otherwise returns -1. -func InitSeccomp(config *configs.Seccomp) (int, error) { +// specified in config. Returns the seccomp file descriptor if any of the +// filters include a SCMP_ACT_NOTIFY action. +func InitSeccomp(config *configs.Seccomp) (*os.File, error) { if config == nil { - return -1, errors.New("cannot initialize Seccomp - nil config passed") + return nil, errors.New("cannot initialize Seccomp - nil config passed") } defaultAction, err := getAction(config.DefaultAction, config.DefaultErrnoRet) if err != nil { - return -1, errors.New("error initializing seccomp - invalid default action") + return nil, errors.New("error initializing seccomp - invalid default action") } // Ignore the error since pre-2.4 libseccomp is treated as API level 0. @@ -45,7 +45,7 @@ func InitSeccomp(config *configs.Seccomp) (int, error) { for _, call := range config.Syscalls { if call.Action == configs.Notify { if apiLevel < 6 { - return -1, fmt.Errorf("seccomp notify unsupported: API level: got %d, want at least 6. Please try with libseccomp >= 2.5.0 and Linux >= 5.7", apiLevel) + return nil, fmt.Errorf("seccomp notify unsupported: API level: got %d, want at least 6. Please try with libseccomp >= 2.5.0 and Linux >= 5.7", apiLevel) } // We can't allow the write syscall to notify to the seccomp agent. @@ -61,36 +61,36 @@ func InitSeccomp(config *configs.Seccomp) (int, error) { // agent allows those syscalls to proceed, initialization works just fine and the agent can // handle future read()/close() syscalls as it wanted. if call.Name == "write" { - return -1, errors.New("SCMP_ACT_NOTIFY cannot be used for the write syscall") + return nil, errors.New("SCMP_ACT_NOTIFY cannot be used for the write syscall") } } } // See comment on why write is not allowed. The same reason applies, as this can mean handling write too. if defaultAction == libseccomp.ActNotify { - return -1, errors.New("SCMP_ACT_NOTIFY cannot be used as default action") + return nil, errors.New("SCMP_ACT_NOTIFY cannot be used as default action") } filter, err := libseccomp.NewFilter(defaultAction) if err != nil { - return -1, fmt.Errorf("error creating filter: %w", err) + return nil, fmt.Errorf("error creating filter: %w", err) } // Add extra architectures for _, arch := range config.Architectures { scmpArch, err := libseccomp.GetArchFromString(arch) if err != nil { - return -1, fmt.Errorf("error validating Seccomp architecture: %w", err) + return nil, fmt.Errorf("error validating Seccomp architecture: %w", err) } if err := filter.AddArch(scmpArch); err != nil { - return -1, fmt.Errorf("error adding architecture to seccomp filter: %w", err) + return nil, fmt.Errorf("error adding architecture to seccomp filter: %w", err) } } // Add extra flags. for _, flag := range config.Flags { if err := setFlag(filter, flag); err != nil { - return -1, err + return nil, err } } @@ -110,25 +110,24 @@ func InitSeccomp(config *configs.Seccomp) (int, error) { // Unset no new privs bit if err := filter.SetNoNewPrivsBit(false); err != nil { - return -1, fmt.Errorf("error setting no new privileges: %w", err) + return nil, fmt.Errorf("error setting no new privileges: %w", err) } // Add a rule for each syscall for _, call := range config.Syscalls { if call == nil { - return -1, errors.New("encountered nil syscall while initializing Seccomp") + return nil, errors.New("encountered nil syscall while initializing Seccomp") } if err := matchCall(filter, call, defaultAction); err != nil { - return -1, err + return nil, err } } seccompFd, err := patchbpf.PatchAndLoad(config, filter) if err != nil { - return -1, fmt.Errorf("error loading seccomp filter into kernel: %w", err) + return nil, fmt.Errorf("error loading seccomp filter into kernel: %w", err) } - return seccompFd, nil } diff --git a/libcontainer/seccomp/seccomp_unsupported.go b/libcontainer/seccomp/seccomp_unsupported.go index 885529dc7d0..b08a3498687 100644 --- a/libcontainer/seccomp/seccomp_unsupported.go +++ b/libcontainer/seccomp/seccomp_unsupported.go @@ -5,6 +5,7 @@ package seccomp import ( "errors" + "os" "github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runtime-spec/specs-go" @@ -13,11 +14,11 @@ import ( var ErrSeccompNotEnabled = errors.New("seccomp: config provided but seccomp not supported") // InitSeccomp does nothing because seccomp is not supported. -func InitSeccomp(config *configs.Seccomp) (int, error) { +func InitSeccomp(config *configs.Seccomp) (*os.File, error) { if config != nil { - return -1, ErrSeccompNotEnabled + return nil, ErrSeccompNotEnabled } - return -1, nil + return nil, nil } // FlagSupported tells if a provided seccomp flag is supported. diff --git a/libcontainer/setns_init_linux.go b/libcontainer/setns_init_linux.go index ac58190758a..40a47a2e95c 100644 --- a/libcontainer/setns_init_linux.go +++ b/libcontainer/setns_init_linux.go @@ -75,7 +75,6 @@ func (l *linuxSetnsInit) Init() error { if err != nil { return err } - if err := syncParentSeccomp(l.pipe, seccompFd); err != nil { return err } @@ -94,7 +93,6 @@ func (l *linuxSetnsInit) Init() error { if err != nil { return fmt.Errorf("unable to init seccomp: %w", err) } - if err := syncParentSeccomp(l.pipe, seccompFd); err != nil { return err } diff --git a/libcontainer/standard_init_linux.go b/libcontainer/standard_init_linux.go index f3d04282362..3d97b06d1b5 100644 --- a/libcontainer/standard_init_linux.go +++ b/libcontainer/standard_init_linux.go @@ -25,7 +25,6 @@ type linuxStandardInit struct { parentPid int fifoFd int logFd int - mountFds mountFds config *initConfig } @@ -86,17 +85,7 @@ func (l *linuxStandardInit) Init() error { // initialises the labeling system selinux.GetEnabled() - // We don't need the mount nor idmap fds after prepareRootfs() nor if it fails. - err := prepareRootfs(l.pipe, l.config, l.mountFds) - for _, m := range append(l.mountFds.sourceFds, l.mountFds.idmapFds...) { - if m == -1 { - continue - } - if err := unix.Close(m); err != nil { - return fmt.Errorf("unable to close mountFds fds: %w", err) - } - } - + err := prepareRootfs(l.pipe, l.config) if err != nil { return err } diff --git a/libcontainer/sync.go b/libcontainer/sync.go index 25dc2863071..2cb659ff94e 100644 --- a/libcontainer/sync.go +++ b/libcontainer/sync.go @@ -5,6 +5,7 @@ import ( "errors" "fmt" "io" + "os" "github.com/opencontainers/runc/libcontainer/utils" ) @@ -15,15 +16,24 @@ type syncType string // during container setup. They come in pairs (with procError being a generic // response which is followed by an &initError). // -// [ child ] <-> [ parent ] +// [ child ] <-> [ parent ] // -// procHooks --> [run hooks] -// <-- procResume +// procMountPlease --> [open_tree(2) and configure mount] +// Arg: configs.Mount +// <-- procMountFd +// file: mountfd // -// procReady --> [final setup] -// <-- procRun +// procSeccomp --> [forward fd to listenerPath] +// file: seccomp fd +// --- no return synchronisation // -// procSeccomp --> [pick up seccomp fd with pidfd_getfd()] +// procHooks --> [run hooks] +// <-- procResume +// +// procReady --> [final setup] +// <-- procRun +// +// procSeccomp --> [grab seccomp fd with pidfd_getfd()] // <-- procSeccompDone const ( procError syncType = "procError" @@ -31,13 +41,23 @@ const ( procRun syncType = "procRun" procHooks syncType = "procHooks" procResume syncType = "procResume" + procMountPlease syncType = "procMountPlease" + procMountFd syncType = "procMountFd" procSeccomp syncType = "procSeccomp" procSeccompDone syncType = "procSeccompDone" ) +type syncFlags int + +const ( + syncFlagHasFd syncFlags = (1 << iota) +) + type syncT struct { - Type syncType `json:"type"` - Fd int `json:"fd"` + Type syncType `json:"type"` + Flags syncFlags `json:"flags"` + Arg *json.RawMessage `json:"arg,omitempty"` + File *os.File `json:"-"` // passed oob through SCM_RIGHTS } // initError is used to wrap errors for passing them via JSON, @@ -50,74 +70,100 @@ func (i initError) Error() string { return i.Message } -// writeSync is used to write to a synchronisation pipe. An error is returned -// if there was a problem writing the payload. -func writeSync(pipe io.Writer, sync syncType) error { - return writeSyncWithFd(pipe, sync, -1) +func doWriteSync(pipe *os.File, sync syncT) error { + sync.Flags &= ^syncFlagHasFd + if sync.File != nil { + sync.Flags |= syncFlagHasFd + } + if err := utils.WriteJSON(pipe, sync); err != nil { + return fmt.Errorf("writing sync %q: %w", sync.Type, err) + } + if sync.Flags&syncFlagHasFd != 0 { + if err := utils.SendFile(pipe, sync.File); err != nil { + return fmt.Errorf("sending file after sync %q: %w", sync.Type, err) + } + } + return nil } -// writeSyncWithFd is used to write to a synchronisation pipe. An error is -// returned if there was a problem writing the payload. -func writeSyncWithFd(pipe io.Writer, sync syncType, fd int) error { - if err := utils.WriteJSON(pipe, syncT{sync, fd}); err != nil { - return fmt.Errorf("writing syncT %q: %w", string(sync), err) +func writeSync(pipe *os.File, sync syncType) error { + return doWriteSync(pipe, syncT{Type: sync}) +} + +func writeSyncArg(pipe *os.File, sync syncType, arg interface{}) error { + argJSON, err := json.Marshal(arg) + if err != nil { + return fmt.Errorf("writing sync %q: marshal argument failed: %w", sync, err) } - return nil + argJSONMsg := json.RawMessage(argJSON) + return doWriteSync(pipe, syncT{Type: sync, Arg: &argJSONMsg}) } -// readSync is used to read from a synchronisation pipe. An error is returned -// if we got an initError, the pipe was closed, or we got an unexpected flag. -func readSync(pipe io.Reader, expected syncType) error { - var procSync syncT - if err := json.NewDecoder(pipe).Decode(&procSync); err != nil { +func doReadSync(pipe *os.File) (syncT, error) { + var sync syncT + if err := json.NewDecoder(pipe).Decode(&sync); err != nil { if errors.Is(err, io.EOF) { - return errors.New("parent closed synchronisation channel") + return sync, err } - return fmt.Errorf("failed reading error from parent: %w", err) + return sync, fmt.Errorf("reading from parent failed: %w", err) } - - if procSync.Type == procError { + if sync.Type == procError { var ierr initError - - if err := json.NewDecoder(pipe).Decode(&ierr); err != nil { - return fmt.Errorf("failed reading error from parent: %w", err) + if sync.Arg == nil { + return sync, errors.New("procError missing error payload") + } + if err := json.Unmarshal(*sync.Arg, &ierr); err != nil { + return sync, fmt.Errorf("unmarshal procError failed: %w", err) + } + return sync, &ierr + } + if sync.Flags&syncFlagHasFd != 0 { + file, err := utils.RecvFile(pipe) + if err != nil { + return sync, fmt.Errorf("receiving fd from sync %q failed: %w", sync.Type, err) } + sync.File = file + } + return sync, nil +} - return &ierr +func readSyncFull(pipe *os.File, expected syncType) (syncT, error) { + sync, err := doReadSync(pipe) + if err != nil { + return sync, err } + if sync.Type != expected { + return sync, fmt.Errorf("unexpected synchronisation flag: got %q, expected %q", sync.Type, expected) + } + return sync, nil +} - if procSync.Type != expected { - return errors.New("invalid synchronisation flag from parent") +func readSync(pipe *os.File, expected syncType) error { + sync, err := readSyncFull(pipe, expected) + if err != nil { + return err + } + if sync.Arg != nil { + return fmt.Errorf("sync %q had unexpected argument passed: %q", expected, string(*sync.Arg)) + } + if sync.File != nil { + _ = sync.File.Close() + return fmt.Errorf("sync %q had unexpected file passed", sync.Type) } return nil } // parseSync runs the given callback function on each syncT received from the // child. It will return once io.EOF is returned from the given pipe. -func parseSync(pipe io.Reader, fn func(*syncT) error) error { - dec := json.NewDecoder(pipe) +func parseSync(pipe *os.File, fn func(*syncT) error) error { for { - var sync syncT - if err := dec.Decode(&sync); err != nil { + sync, err := doReadSync(pipe) + if err != nil { if errors.Is(err, io.EOF) { break } return err } - - // We handle this case outside fn for cleanliness reasons. - var ierr *initError - if sync.Type == procError { - if err := dec.Decode(&ierr); err != nil && !errors.Is(err, io.EOF) { - return fmt.Errorf("error decoding proc error from init: %w", err) - } - if ierr != nil { - return ierr - } - // Programmer error. - panic("No error following JSON procError payload.") - } - if err := fn(&sync); err != nil { return err } diff --git a/libcontainer/system/linux.go b/libcontainer/system/linux.go index e1d6eb18034..200d375e840 100644 --- a/libcontainer/system/linux.go +++ b/libcontainer/system/linux.go @@ -4,10 +4,14 @@ package system import ( + "errors" + "fmt" + "io" "os" "os/exec" "unsafe" + "github.com/sirupsen/logrus" "golang.org/x/sys/unix" ) @@ -102,3 +106,40 @@ func GetSubreaper() (int, error) { return int(i), nil } + +func ExecutableMemfd(comment string, flags int) (*os.File, error) { + // Try to use MFD_EXEC first. On pre-6.3 kernels we get -EINVAL for this + // flag. On post-6.3 kernels, with vm.memfd_noexec=1 this ensures we get an + // executable memfd. For vm.memfd_noexec=2 this is a bit more complicated. + // The original vm.memfd_noexec=2 implementation incorrectly silently + // allowed MFD_EXEC[1] -- this should be fixed in 6.6. On 6.6 and newer + // kernels, we will get -EACCES if we try to use MFD_EXEC with + // vm.memfd_noexec=2 (for 6.3-6.5, -EINVAL was the intended return value). + // + // The upshot is we only need to retry without MFD_EXEC on -EINVAL because + // it just so happens that passing MFD_EXEC bypasses vm.memfd_noexec=2 on + // kernels where -EINVAL is actually a security denial. + memfd, err := unix.MemfdCreate(comment, flags|unix.MFD_EXEC) + if errors.Is(err, unix.EINVAL) { + memfd, err = unix.MemfdCreate(comment, flags) + } + if err != nil { + if errors.Is(err, unix.EACCES) { + logrus.Infof("memfd_create(MFD_EXEC) failed, possibly due to vm.memfd_noexec=2 -- falling back to less secure O_TMPFILE") + } + err := os.NewSyscallError("memfd_create", err) + return nil, fmt.Errorf("failed to create executable memfd: %w", err) + } + return os.NewFile(uintptr(memfd), "/memfd:"+comment), nil +} + +// Copy is a wrapper around io.Copy that continues the copy despite EINTR. +func Copy(dst io.Writer, src io.Reader) (written int64, err error) { + for { + n, err := io.Copy(dst, src) + written += n + if !errors.Is(err, unix.EINTR) { + return written, err + } + } +} diff --git a/libcontainer/userns/usernsfd_linux.go b/libcontainer/userns/usernsfd_linux.go new file mode 100644 index 00000000000..0f518c5d5fe --- /dev/null +++ b/libcontainer/userns/usernsfd_linux.go @@ -0,0 +1,134 @@ +package userns + +import ( + "fmt" + "os" + "sort" + "strings" + "sync" + "syscall" + + "github.com/sirupsen/logrus" + "golang.org/x/sys/unix" + + "github.com/opencontainers/runc/libcontainer/configs" +) + +type Mapping struct { + UIDMappings []configs.IDMap + GIDMappings []configs.IDMap +} + +func (m Mapping) toSys() (uids, gids []syscall.SysProcIDMap) { + for _, uid := range m.UIDMappings { + uids = append(uids, syscall.SysProcIDMap{ + ContainerID: uid.ContainerID, + HostID: uid.HostID, + Size: uid.Size, + }) + } + for _, gid := range m.GIDMappings { + gids = append(gids, syscall.SysProcIDMap{ + ContainerID: gid.ContainerID, + HostID: gid.HostID, + Size: gid.Size, + }) + } + return +} + +// id returns a unique identifier for this mapping, agnostic of the order of +// the uid and gid mappings (because the order doesn't matter to the kernel). +// The set of userns handles is indexed using this ID. +func (m Mapping) id() string { + var uids, gids []string + for _, idmap := range m.UIDMappings { + uids = append(uids, fmt.Sprintf("%d:%d:%d", idmap.ContainerID, idmap.HostID, idmap.Size)) + } + for _, idmap := range m.GIDMappings { + gids = append(gids, fmt.Sprintf("%d:%d:%d", idmap.ContainerID, idmap.HostID, idmap.Size)) + } + // We don't care about the sort order -- just sort them. + sort.Strings(uids) + sort.Strings(gids) + return "uid=" + strings.Join(uids, ",") + ";gid=" + strings.Join(gids, ",") +} + +type Handles struct { + m sync.Mutex + procs map[string]*os.Process +} + +// Release kills all of the child processes created with this Handle, releasing +// the resources used to create existing userns handles. The kernel guarantees +// all existing files returned from Get() will continue to work even after +// calling Release(). The same Handles can be re-used after calling Release(). +func (hs *Handles) Release() { + hs.m.Lock() + defer hs.m.Unlock() + + for _, proc := range hs.procs { + // We can't do anything about errors here, and in the case of runc we + // don't care because in the worst case we end up with zombies that + // pid1 can deal with. + _ = proc.Kill() + _, _ = proc.Wait() + } + hs.procs = make(map[string]*os.Process) +} + +func spawnProc(req Mapping) (*os.Process, error) { + // We need to spawn a subprocess with the requested mappings, which is + // unfortunately quite expensive. The "safe" way of doing this is natively + // with Go (and then spawning something like "sleep infinity"), but + // execve() is a waste of cycles because we just need some process to have + // the right mapping, we don't care what it's executing. The "unsafe" + // option of doing a clone() behind the back of Go is probably okay in + // theory as long as we just do kill(getpid(), SIGSTOP). However, if we + // tell Go to put the new process into PTRACE_TRACEME mode, we can avoid + // the exec and not have to faff around with the mappings. + // + // Note that Go's stdlib does not support newuidmap, but in the case of + // id-mapped mounts, it seems incredibly unlikely that the user will be + // requesting us to do a remapping as an unprivileged user with mappings + // they have privileges over. + logrus.Debugf("spawning dummy process for id-mapping %s", req.id()) + uidMappings, gidMappings := req.toSys() + return os.StartProcess("/proc/self/exe", []string{"runc", "--help"}, &os.ProcAttr{ + Sys: &syscall.SysProcAttr{ + Cloneflags: unix.CLONE_NEWUSER, + UidMappings: uidMappings, + GidMappings: gidMappings, + GidMappingsEnableSetgroups: false, + // Put the process into PTRACE_TRACEME mode to allow us to get the + // userns without having a proper execve() target. + Ptrace: true, + }, + }) +} + +// Get returns a handle to a /proc/$pid/ns/user nsfs file with the requested +// mapping. The processes spawned to produce userns nsfds are cached, so if +// equivalent user namespace mappings are requested, the same user namespace +// will be returned. The caller is responsible for closing the returned file +// descriptor. +func (hs *Handles) Get(req Mapping) (f *os.File, err error) { + hs.m.Lock() + defer hs.m.Unlock() + + if hs.procs == nil { + hs.procs = make(map[string]*os.Process) + } + + proc, ok := hs.procs[req.id()] + if !ok { + proc, err = spawnProc(req) + if err != nil { + return nil, fmt.Errorf("failed to spawn dummy process for map %s: %w", req.id(), err) + } + // TODO: Maybe we should just cache handles to /proc/$pid/ns/user, so we + // can kill the process immediately? + hs.procs[req.id()] = proc + } + return os.Open(fmt.Sprintf("/proc/%d/ns/user", proc.Pid)) +} diff --git a/libcontainer/userns/usernsfd_linux_test.go b/libcontainer/userns/usernsfd_linux_test.go new file mode 100644 index 00000000000..c28f55d8eff --- /dev/null +++ b/libcontainer/userns/usernsfd_linux_test.go @@ -0,0 +1,52 @@ +package userns + +import ( + "os" + "testing" + + "github.com/opencontainers/runc/libcontainer/configs" +) + +func BenchmarkSpawnProc(b *testing.B) { + if os.Geteuid() != 0 { + b.Skip("spawning user namespaced processes requires root") + } + + // We can re-use the mapping as we call spawnProc() directly. + mapping := Mapping{ + UIDMappings: []configs.IDMap{ + {ContainerID: 0, HostID: 1337, Size: 142}, + {ContainerID: 150, HostID: 0, Size: 1}, + {ContainerID: 442, HostID: 1111, Size: 12}, + {ContainerID: 1000, HostID: 9999, Size: 92}, + {ContainerID: 9999, HostID: 1000000, Size: 4}, + // Newer kernels support more than 5 entries, but stick to 5 here. + }, + GIDMappings: []configs.IDMap{ + {ContainerID: 1, HostID: 2337, Size: 142}, + {ContainerID: 145, HostID: 6, Size: 1}, + {ContainerID: 200, HostID: 1000, Size: 12}, + {ContainerID: 1000, HostID: 9888, Size: 92}, + {ContainerID: 8999, HostID: 1000000, Size: 4}, + // Newer kernels support more than 5 entries, but stick to 5 here. + }, + } + + procs := make([]*os.Process, 0, b.N) + b.Cleanup(func() { + for _, proc := range procs { + if proc != nil { + _ = proc.Kill() + _, _ = proc.Wait() + } + } + }) + + for i := 0; i < b.N; i++ { + proc, err := spawnProc(mapping) + if err != nil { + b.Error(err) + } + procs = append(procs, proc) + } +} diff --git a/libcontainer/utils/cmsg.go b/libcontainer/utils/cmsg.go index fd9326cb59a..c4c122e41fe 100644 --- a/libcontainer/utils/cmsg.go +++ b/libcontainer/utils/cmsg.go @@ -19,13 +19,14 @@ package utils import ( "fmt" "os" + "runtime" "golang.org/x/sys/unix" ) -// MaxNameLen is the maximum length of the name of a file descriptor being -// sent using SendFd. The name of the file handle returned by RecvFd will never -// be larger than this value. +// MaxNameLen is the maximum length of the name of a file descriptor being sent +// using SendFd. The name of the file handle returned by RecvFile will never be +// larger than this value. const MaxNameLen = 4096 // oobSpace is the size of the oob slice required to store a single FD. Note @@ -33,26 +34,21 @@ const MaxNameLen = 4096 // so sizeof(fd) = 4. var oobSpace = unix.CmsgSpace(4) -// RecvFd waits for a file descriptor to be sent over the given AF_UNIX +// RecvFile waits for a file descriptor to be sent over the given AF_UNIX // socket. The file name of the remote file descriptor will be recreated // locally (it is sent as non-auxiliary data in the same payload). -func RecvFd(socket *os.File) (*os.File, error) { - // For some reason, unix.Recvmsg uses the length rather than the capacity - // when passing the msg_controllen and other attributes to recvmsg. So we - // have to actually set the length. +func RecvFile(socket *os.File) (_ *os.File, Err error) { name := make([]byte, MaxNameLen) oob := make([]byte, oobSpace) sockfd := socket.Fd() - n, oobn, _, _, err := unix.Recvmsg(int(sockfd), name, oob, 0) + n, oobn, _, _, err := unix.Recvmsg(int(sockfd), name, oob, unix.MSG_CMSG_CLOEXEC) if err != nil { return nil, err } - if n >= MaxNameLen || oobn != oobSpace { return nil, fmt.Errorf("recvfd: incorrect number of bytes read (n=%d oobn=%d)", n, oobn) } - // Truncate. name = name[:n] oob = oob[:oobn] @@ -61,36 +57,61 @@ func RecvFd(socket *os.File) (*os.File, error) { if err != nil { return nil, err } - if len(scms) != 1 { - return nil, fmt.Errorf("recvfd: number of SCMs is not 1: %d", len(scms)) + + // We cannot control how many SCM_RIGHTS we receive, and upon receiving + // them all of the descriptors are installed in our fd table, so we need to + // parse all of the SCM_RIGHTS we received in order to close all of the + // descriptors on error. + var fds []int + defer func() { + for i, fd := range fds { + // Only close 0 if err != nil, and close everything else. + if i != 0 || err != nil { + _ = unix.Close(fd) + } + } + }() + var lastErr error + for _, scm := range scms { + if scm.Header.Type == unix.SCM_RIGHTS { + scmFds, err := unix.ParseUnixRights(&scm) + if err != nil { + lastErr = err + } else { + fds = append(fds, scmFds...) + } + } + } + if lastErr != nil { + return nil, lastErr } - scm := scms[0] - fds, err := unix.ParseUnixRights(&scm) - if err != nil { - return nil, err + // We do this after collecting the fds to make sure we close them all when + // returning an error here. + if len(scms) != 1 { + return nil, fmt.Errorf("recvfd: number of SCMs is not 1: %d", len(scms)) } if len(fds) != 1 { return nil, fmt.Errorf("recvfd: number of fds is not 1: %d", len(fds)) } - fd := uintptr(fds[0]) - - return os.NewFile(fd, string(name)), nil + return os.NewFile(uintptr(fds[0]), string(name)), nil } -// SendFd sends a file descriptor over the given AF_UNIX socket. In -// addition, the file.Name() of the given file will also be sent as -// non-auxiliary data in the same payload (allowing to send contextual -// information for a file descriptor). -func SendFd(socket *os.File, name string, fd uintptr) error { +// SendFile sends a file over the given AF_UNIX socket. file.Name() is also +// included so that if the other end uses RecvFile, the file will have the same +// name information. +func SendFile(socket *os.File, file *os.File) error { + name := file.Name() if len(name) >= MaxNameLen { return fmt.Errorf("sendfd: filename too long: %s", name) } - return SendFds(socket, []byte(name), int(fd)) + err := SendRawFd(socket, name, file.Fd()) + runtime.KeepAlive(file) + return err } -// SendFds sends a list of files descriptor and msg over the given AF_UNIX socket. -func SendFds(socket *os.File, msg []byte, fds ...int) error { - oob := unix.UnixRights(fds...) - return unix.Sendmsg(int(socket.Fd()), msg, oob, nil, 0) +// SendRawFd sends a specific file descriptor over the given AF_UNIX socket. +func SendRawFd(socket *os.File, msg string, fd uintptr) error { + oob := unix.UnixRights(int(fd)) + return unix.Sendmsg(int(socket.Fd()), []byte(msg), oob, nil, 0) } diff --git a/libcontainer/utils/utils_unix.go b/libcontainer/utils/utils_unix.go index 6b9a7be038f..ca520b63b36 100644 --- a/libcontainer/utils/utils_unix.go +++ b/libcontainer/utils/utils_unix.go @@ -91,7 +91,7 @@ func CloseExecFrom(minFd int) error { } // NewSockPair returns a new unix socket pair -func NewSockPair(name string) (parent *os.File, child *os.File, err error) { +func NewSockPair(name string) (parent, child *os.File, err error) { fds, err := unix.Socketpair(unix.AF_LOCAL, unix.SOCK_STREAM|unix.SOCK_CLOEXEC, 0) if err != nil { return nil, nil, err diff --git a/tests/integration/idmap.bats b/tests/integration/idmap.bats index 375191bf705..0dac40b21e7 100644 --- a/tests/integration/idmap.bats +++ b/tests/integration/idmap.bats @@ -2,6 +2,9 @@ load helpers +OVERFLOW_UID="$(cat /proc/sys/kernel/overflowuid)" +OVERFLOW_GID="$(cat /proc/sys/kernel/overflowgid)" + function setup() { requires root requires_kernel 5.12 @@ -9,152 +12,306 @@ function setup() { setup_debian - # Prepare source folders for bind mount - mkdir -p source-{1,2}/ - touch source-{1,2}/foo.txt + # Prepare source folders for mounts. + mkdir -p source-{1,2,multi{1,2,3}}/ + touch source-{1,2,multi{1,2,3}}/foo.txt + touch source-multi{1,2,3}/{bar,baz}.txt - # Use other owner for source-2 + # Change the owners for everything other than source-1. chown 1:1 source-2/foo.txt - mkdir -p rootfs/{proc,sys,tmp} - mkdir -p rootfs/tmp/mount-{1,2} - mkdir -p rootfs/mnt/bind-mount-{1,2} - - update_config ' .linux.namespaces += [{"type": "user"}] - | .linux.uidMappings += [{"hostID": 100000, "containerID": 0, "size": 65536}] - | .linux.gidMappings += [{"hostID": 100000, "containerID": 0, "size": 65536}] - | .process.args += ["-c", "stat -c =%u=%g= /tmp/mount-1/foo.txt"] - | .mounts += [ - { - "source": "source-1/", - "destination": "/tmp/mount-1", - "options": ["bind"], - "uidMappings": [ { - "containerID": 0, - "hostID": 100000, - "size": 65536 - } - ], - "gidMappings": [ { - "containerID": 0, - "hostID": 100000, - "size": 65536 - } - ] - } - ] ' + chown 100:211 source-multi1/foo.txt + chown 101:222 source-multi1/bar.txt + chown 102:233 source-multi1/baz.txt + + # Same gids as multi1, different uids. + chown 200:211 source-multi2/foo.txt + chown 201:222 source-multi2/bar.txt + chown 202:233 source-multi2/baz.txt + + # 1000 uids, 500 gids + chown 5000528:6000491 source-multi3/foo.txt + chown 5000133:6000337 source-multi3/bar.txt + chown 5000999:6000444 source-multi3/baz.txt } function teardown() { teardown_bundle } -@test "simple idmap mount" { +function setup_idmap_userns() { + update_config '.linux.namespaces += [{"type": "user"}] + | .linux.uidMappings += [{"containerID": 0, "hostID": 100000, "size": 65536}] + | .linux.gidMappings += [{"containerID": 0, "hostID": 100000, "size": 65536}]' +} + +function setup_bind_mount() { + mountname="${1:-1}" + update_config '.mounts += [ + { + "source": "source-'"$mountname"'/", + "destination": "/tmp/bind-mount-'"$mountname"'", + "options": ["bind"] + } + ]' +} + +function setup_idmap_single_mount() { + uidmap="$1" # ctr:host:size + gidmap="${2:-$1}" # ctr:host:size + mountname="${3:-1}" + destname="${4:-$mountname}" + + read -r uid_containerID uid_hostID uid_size <<<"$(tr : ' ' <<<"$uidmap")" + read -r gid_containerID gid_hostID gid_size <<<"$(tr : ' ' <<<"$gidmap")" + + update_config '.mounts += [ + { + "source": "source-'"$mountname"'/", + "destination": "/tmp/mount-'"$destname"'", + "options": ["bind"], + "uidMappings": [{"containerID": '"$uid_containerID"', "hostID": '"$uid_hostID"', "size": '"$uid_size"'}], + "gidMappings": [{"containerID": '"$gid_containerID"', "hostID": '"$gid_hostID"', "size": '"$gid_size"'}] + } + ]' +} + +function setup_idmap_basic_mount() { + mountname="${1:-1}" + setup_idmap_single_mount 0:100000:65536 0:100000:65536 "$mountname" +} + +@test "simple idmap mount [userns]" { + setup_idmap_userns + setup_idmap_basic_mount + + update_config '.process.args = ["sh", "-c", "stat -c =%u=%g= /tmp/mount-1/foo.txt"]' + runc run test_debian [ "$status" -eq 0 ] [[ "$output" == *"=0=0="* ]] } -@test "write to an idmap mount" { - update_config ' .process.args = ["sh", "-c", "touch /tmp/mount-1/bar && stat -c =%u=%g= /tmp/mount-1/bar"]' +@test "simple idmap mount [no userns]" { + setup_idmap_basic_mount + + update_config '.process.args = ["sh", "-c", "stat -c =%u=%g= /tmp/mount-1/foo.txt"]' + + runc run test_debian + [ "$status" -eq 0 ] + [[ "$output" == *"=100000=100000="* ]] +} + +@test "write to an idmap mount [userns]" { + setup_idmap_userns + setup_idmap_basic_mount + + update_config '.process.args = ["sh", "-c", "touch /tmp/mount-1/bar && stat -c =%u=%g= /tmp/mount-1/bar"]' + runc run test_debian [ "$status" -eq 0 ] [[ "$output" == *"=0=0="* ]] } -@test "idmap mount with propagation flag" { - update_config ' .process.args = ["sh", "-c", "findmnt -o PROPAGATION /tmp/mount-1"]' +@test "write to an idmap mount [no userns]" { + setup_idmap_basic_mount + + update_config '.process.args = ["sh", "-c", "touch /tmp/mount-1/bar && stat -c =%u=%g= /tmp/mount-1/bar"]' + + runc run test_debian + # The write must fail because the user is unmapped. + [ "$status" -ne 0 ] + [[ "$output" == *"Value too large for defined data type"* ]] # ERANGE +} + +@test "idmap mount with propagation flag [userns]" { + setup_idmap_userns + setup_idmap_basic_mount + + update_config '.process.args = ["sh", "-c", "findmnt -o PROPAGATION /tmp/mount-1"]' # Add the shared option to the idmap mount - update_config ' .mounts |= map((select(.source == "source-1/") | .options += ["shared"]) // .)' + update_config '.mounts |= map((select(.source == "source-1/") | .options += ["shared"]) // .)' runc run test_debian [ "$status" -eq 0 ] [[ "$output" == *"shared"* ]] } -@test "idmap mount with bind mount" { - update_config ' .mounts += [ - { - "source": "source-2/", - "destination": "/tmp/mount-2", - "options": ["bind"] - } - ] ' +@test "idmap mount with bind mount [userns]" { + setup_idmap_userns + setup_idmap_basic_mount + setup_bind_mount + + update_config '.process.args = ["bash", "-c", "stat -c =%n:%u=%g= /tmp/{,bind-}mount-1/foo.txt"]' runc run test_debian [ "$status" -eq 0 ] - [[ "$output" == *"=0=0="* ]] + [[ "$output" == *"=/tmp/mount-1/foo.txt:0=0="* ]] + [[ "$output" == *"=/tmp/bind-mount-1/foo.txt:$OVERFLOW_UID=$OVERFLOW_GID="* ]] } -@test "two idmap mounts with two bind mounts" { - update_config ' .process.args = ["sh", "-c", "stat -c =%u=%g= /tmp/mount-1/foo.txt /tmp/mount-2/foo.txt"] - | .mounts += [ - { - "source": "source-1/", - "destination": "/mnt/bind-mount-1", - "options": ["bind"] - }, - { - "source": "source-2/", - "destination": "/mnt/bind-mount-2", - "options": ["bind"] - }, - { - "source": "source-2/", - "destination": "/tmp/mount-2", - "options": ["bind"], - "uidMappings": [ { - "containerID": 0, - "hostID": 100000, - "size": 65536 - } - ], - "gidMappings": [ { - "containerID": 0, - "hostID": 100000, - "size": 65536 - } - ] - } - ] ' +@test "idmap mount with bind mount [no userns]" { + setup_idmap_basic_mount + setup_bind_mount + + update_config '.process.args = ["bash", "-c", "stat -c =%n:%u=%g= /tmp/{,bind-}mount-1/foo.txt"]' runc run test_debian [ "$status" -eq 0 ] - [[ "$output" == *"=0=0="* ]] - # source-2/ is owned by 1:1, so we expect this with the idmap mount too. - [[ "$output" == *"=1=1="* ]] + [[ "$output" == *"=/tmp/mount-1/foo.txt:100000=100000="* ]] + [[ "$output" == *"=/tmp/bind-mount-1/foo.txt:0=0="* ]] +} + +@test "two idmap mounts (same mapping) with two bind mounts [userns]" { + setup_idmap_userns + + setup_idmap_basic_mount 1 + setup_bind_mount 1 + setup_bind_mount 2 + setup_idmap_basic_mount 2 + + update_config '.process.args = ["bash", "-c", "stat -c =%n:%u=%g= /tmp/mount-[12]/foo.txt"]' + + runc run test_debian + [ "$status" -eq 0 ] + [[ "$output" == *"=/tmp/mount-1/foo.txt:0=0="* ]] + [[ "$output" == *"=/tmp/mount-2/foo.txt:1=1="* ]] +} + +@test "same idmap mount (different mappings) [userns]" { + setup_idmap_userns + + # Mount the same directory with different mappings. Make sure we also use + # different mappings for uids and gids. + setup_idmap_single_mount 100:100000:100 200:100000:100 multi1 + setup_idmap_single_mount 100:101000:100 200:102000:100 multi1 multi1-alt + + update_config '.process.args = ["bash", "-c", "stat -c =%n:%u=%g= /tmp/mount-multi1{,-alt}/{foo,bar,baz}.txt"]' + + runc run test_debian + [ "$status" -eq 0 ] + [[ "$output" == *"=/tmp/mount-multi1/foo.txt:0=11="* ]] + [[ "$output" == *"=/tmp/mount-multi1/bar.txt:1=22="* ]] + [[ "$output" == *"=/tmp/mount-multi1/baz.txt:2=33="* ]] + [[ "$output" == *"=/tmp/mount-multi1-alt/foo.txt:1000=2011="* ]] + [[ "$output" == *"=/tmp/mount-multi1-alt/bar.txt:1001=2022="* ]] + [[ "$output" == *"=/tmp/mount-multi1-alt/baz.txt:1002=2033="* ]] } -@test "idmap mount without gidMappings fails" { - update_config ' .mounts |= map((select(.source == "source-1/") | del(.gidMappings) ) // .)' +@test "same idmap mount (different mappings) [no userns]" { + # Mount the same directory with different mappings. Make sure we also use + # different mappings for uids and gids. + setup_idmap_single_mount 100:100000:100 200:100000:100 multi1 + setup_idmap_single_mount 100:101000:100 200:102000:100 multi1 multi1-alt + + update_config '.process.args = ["bash", "-c", "stat -c =%n:%u=%g= /tmp/mount-multi1{,-alt}/{foo,bar,baz}.txt"]' runc run test_debian - [ "$status" -eq 1 ] - [[ "${output}" == *"invalid mount"* ]] + [ "$status" -eq 0 ] + [[ "$output" == *"=/tmp/mount-multi1/foo.txt:100000=100011="* ]] + [[ "$output" == *"=/tmp/mount-multi1/bar.txt:100001=100022="* ]] + [[ "$output" == *"=/tmp/mount-multi1/baz.txt:100002=100033="* ]] + [[ "$output" == *"=/tmp/mount-multi1-alt/foo.txt:101000=102011="* ]] + [[ "$output" == *"=/tmp/mount-multi1-alt/bar.txt:101001=102022="* ]] + [[ "$output" == *"=/tmp/mount-multi1-alt/baz.txt:101002=102033="* ]] } -@test "idmap mount without uidMappings fails" { - update_config ' .mounts |= map((select(.source == "source-1/") | del(.uidMappings) ) // .)' +@test "multiple idmap mounts (different mappings) [userns]" { + setup_idmap_userns + + # Make sure we use different mappings for uids and gids. + setup_idmap_single_mount 100:101100:3 200:101900:50 multi1 + setup_idmap_single_mount 200:102200:3 200:102900:100 multi2 + setup_idmap_single_mount 5000000:103000:1000 6000000:103000:500 multi3 + + update_config '.process.args = ["bash", "-c", "stat -c =%n:%u=%g= /tmp/mount-multi[123]/{foo,bar,baz}.txt"]' runc run test_debian - [ "$status" -eq 1 ] - [[ "${output}" == *"invalid mount"* ]] + [ "$status" -eq 0 ] + [[ "$output" == *"=/tmp/mount-multi1/foo.txt:1100=1911="* ]] + [[ "$output" == *"=/tmp/mount-multi1/bar.txt:1101=1922="* ]] + [[ "$output" == *"=/tmp/mount-multi1/baz.txt:1102=1933="* ]] + [[ "$output" == *"=/tmp/mount-multi2/foo.txt:2200=2911="* ]] + [[ "$output" == *"=/tmp/mount-multi2/bar.txt:2201=2922="* ]] + [[ "$output" == *"=/tmp/mount-multi2/baz.txt:2202=2933="* ]] + [[ "$output" == *"=/tmp/mount-multi3/foo.txt:3528=3491="* ]] + [[ "$output" == *"=/tmp/mount-multi3/bar.txt:3133=3337="* ]] + [[ "$output" == *"=/tmp/mount-multi3/baz.txt:3999=3444="* ]] } -@test "idmap mount without bind fails" { - update_config ' .mounts |= map((select(.source == "source-1/") | .options = [""]) // .)' +@test "multiple idmap mounts (different mappings) [no userns]" { + # Make sure we use different mappings for uids and gids. + setup_idmap_single_mount 100:1100:3 200:1900:50 multi1 + setup_idmap_single_mount 200:2200:3 200:2900:100 multi2 + setup_idmap_single_mount 5000000:3000:1000 6000000:3000:500 multi3 + + update_config '.process.args = ["bash", "-c", "stat -c =%n:%u=%g= /tmp/mount-multi[123]/{foo,bar,baz}.txt"]' runc run test_debian - [ "$status" -eq 1 ] - [[ "${output}" == *"invalid mount"* ]] + [ "$status" -eq 0 ] + [[ "$output" == *"=/tmp/mount-multi1/foo.txt:1100=1911="* ]] + [[ "$output" == *"=/tmp/mount-multi1/bar.txt:1101=1922="* ]] + [[ "$output" == *"=/tmp/mount-multi1/baz.txt:1102=1933="* ]] + [[ "$output" == *"=/tmp/mount-multi2/foo.txt:2200=2911="* ]] + [[ "$output" == *"=/tmp/mount-multi2/bar.txt:2201=2922="* ]] + [[ "$output" == *"=/tmp/mount-multi2/baz.txt:2202=2933="* ]] + [[ "$output" == *"=/tmp/mount-multi3/foo.txt:3528=3491="* ]] + [[ "$output" == *"=/tmp/mount-multi3/bar.txt:3133=3337="* ]] + [[ "$output" == *"=/tmp/mount-multi3/baz.txt:3999=3444="* ]] } -@test "idmap mount with different mapping than userns fails" { - # Let's modify the containerID to be 1, instead of 0 as it is in the - # userns config. - update_config ' .mounts |= map((select(.source == "source-1/") | .uidMappings[0]["containerID"] = 1 ) // .)' +@test "idmap mount (complicated mapping) [userns]" { + setup_idmap_userns + update_config '.mounts += [ + { + "source": "source-multi1/", + "destination": "/tmp/mount-multi1", + "options": ["bind"], + "uidMappings": [ + {"containerID": 100, "hostID": 101000, "size": 1}, + {"containerID": 101, "hostID": 102000, "size": 1}, + {"containerID": 102, "hostID": 103000, "size": 1} + ], + "gidMappings": [ + {"containerID": 210, "hostID": 101100, "size": 10}, + {"containerID": 220, "hostID": 102200, "size": 10}, + {"containerID": 230, "hostID": 103300, "size": 10} + ] + } + ]' + + update_config '.process.args = ["bash", "-c", "stat -c =%n:%u=%g= /tmp/mount-multi1/{foo,bar,baz}.txt"]' runc run test_debian - [ "$status" -eq 1 ] - [[ "${output}" == *"invalid mount"* ]] + [ "$status" -eq 0 ] + [[ "$output" == *"=/tmp/mount-multi1/foo.txt:1000=1101="* ]] + [[ "$output" == *"=/tmp/mount-multi1/bar.txt:2000=2202="* ]] + [[ "$output" == *"=/tmp/mount-multi1/baz.txt:3000=3303="* ]] +} + +@test "idmap mount (complicated mapping) [no userns]" { + update_config '.mounts += [ + { + "source": "source-multi1/", + "destination": "/tmp/mount-multi1", + "options": ["bind"], + "uidMappings": [ + {"containerID": 100, "hostID": 1000, "size": 1}, + {"containerID": 101, "hostID": 2000, "size": 1}, + {"containerID": 102, "hostID": 3000, "size": 1} + ], + "gidMappings": [ + {"containerID": 210, "hostID": 1100, "size": 10}, + {"containerID": 220, "hostID": 2200, "size": 10}, + {"containerID": 230, "hostID": 3300, "size": 10} + ] + } + ]' + + update_config '.process.args = ["bash", "-c", "stat -c =%n:%u=%g= /tmp/mount-multi1/{foo,bar,baz}.txt"]' + runc run test_debian + [ "$status" -eq 0 ] + [[ "$output" == *"=/tmp/mount-multi1/foo.txt:1000=1101="* ]] + [[ "$output" == *"=/tmp/mount-multi1/bar.txt:2000=2202="* ]] + [[ "$output" == *"=/tmp/mount-multi1/baz.txt:3000=3303="* ]] } diff --git a/tty.go b/tty.go index fba3e025bc0..c101aacb78b 100644 --- a/tty.go +++ b/tty.go @@ -100,7 +100,7 @@ func (t *tty) initHostConsole() error { } func (t *tty) recvtty(socket *os.File) (Err error) { - f, err := utils.RecvFd(socket) + f, err := utils.RecvFile(socket) if err != nil { return err }