diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 56bae35f36f..8d35c69c339 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -78,6 +78,15 @@ jobs:
       with:
         bats-version: 1.9.0
 
+    - name: procfs mount
+      run: |
+        # Get the list of mounts to help with debugging.
+        cat /proc/self/mounts
+        # Create a procfs mount that is not masked, to ensure that container
+        # procfs mounts will succeed.
+        sudo mkdir -p /tmp/.procfs-stashed-mount
+        sudo unshare -pf mount -t proc -o subset=pid proc /tmp/.procfs-stashed-mount
+
     - name: unit test
       if: matrix.rootless != 'rootless'
       run: sudo -E PATH="$PATH" -- make TESTFLAGS="${{ matrix.race }}" localunittest
diff --git a/Makefile b/Makefile
index c72dd2f414d..0d48fe8c521 100644
--- a/Makefile
+++ b/Makefile
@@ -105,7 +105,7 @@ unittest: runcimage
 		-t --privileged --rm \
 		-v /lib/modules:/lib/modules:ro \
 		-v $(CURDIR):/go/src/$(PROJECT) \
-		$(RUNC_IMAGE) make localunittest TESTFLAGS=$(TESTFLAGS)
+		$(RUNC_IMAGE) make localunittest TESTFLAGS="$(TESTFLAGS)"
 
 localunittest: all
 	$(GO) test -timeout 3m -tags "$(BUILDTAGS)" $(TESTFLAGS) -v ./...
@@ -115,7 +115,7 @@ integration: runcimage
 		-t --privileged --rm \
 		-v /lib/modules:/lib/modules:ro \
 		-v $(CURDIR):/go/src/$(PROJECT) \
-		$(RUNC_IMAGE) make localintegration TESTPATH=$(TESTPATH)
+		$(RUNC_IMAGE) make localintegration TESTPATH="$(TESTPATH)"
 
 localintegration: all
 	bats -t tests/integration$(TESTPATH)
diff --git a/contrib/cmd/recvtty/recvtty.go b/contrib/cmd/recvtty/recvtty.go
index 35c293de642..532a4c33904 100644
--- a/contrib/cmd/recvtty/recvtty.go
+++ b/contrib/cmd/recvtty/recvtty.go
@@ -98,7 +98,7 @@ func handleSingle(path string, noStdin bool) error {
 	defer socket.Close()
 
 	// Get the master file descriptor from runC.
-	master, err := utils.RecvFd(socket)
+	master, err := utils.RecvFile(socket)
 	if err != nil {
 		return err
 	}
@@ -171,7 +171,7 @@ func handleNull(path string) error {
 			defer socket.Close()
 
 			// Get the master file descriptor from runC.
-			master, err := utils.RecvFd(socket)
+			master, err := utils.RecvFile(socket)
 			if err != nil {
 				return
 			}
diff --git a/libcontainer/configs/mount_linux.go b/libcontainer/configs/mount_linux.go
index a4275df3a03..0cec2e5aed4 100644
--- a/libcontainer/configs/mount_linux.go
+++ b/libcontainer/configs/mount_linux.go
@@ -6,6 +6,14 @@ type Mount struct {
 	// Source path for the mount.
 	Source string `json:"source"`
 
+	// SourceFd is a open_tree(2)-style file descriptor created with the new
+	// mount API. If non-nil, this indicates that SourceFd is a configured
+	// mountfd that should be used for the mount into the container.
+	//
+	// Note that you cannot use /proc/self/fd/$n-style paths with
+	// open_tree(2)-style file descriptors.
+	SourceFd *int
+
 	// Destination path for the mount inside the container.
 	Destination string `json:"destination"`
 
@@ -34,13 +42,13 @@ type Mount struct {
 	// Note that, the underlying filesystem should support this feature to be
 	// used.
 	// Every mount point could have its own mapping.
-	UIDMappings []IDMap `json:"uidMappings,omitempty"`
+	UIDMappings []IDMap `json:"uid_mappings,omitempty"`
 
 	// GIDMappings is used to changing file group owners w/o calling chown.
 	// Note that, the underlying filesystem should support this feature to be
 	// used.
 	// Every mount point could have its own mapping.
-	GIDMappings []IDMap `json:"gidMappings,omitempty"`
+	GIDMappings []IDMap `json:"gid_mappings,omitempty"`
 }
 
 func (m *Mount) IsBind() bool {
diff --git a/libcontainer/configs/validate/validator.go b/libcontainer/configs/validate/validator.go
index 196b431dba1..d4ca4bd707e 100644
--- a/libcontainer/configs/validate/validator.go
+++ b/libcontainer/configs/validate/validator.go
@@ -259,24 +259,11 @@ func checkIDMapMounts(config *configs.Config, m *configs.Mount) error {
 	}
 
 	if !m.IsBind() {
-		return fmt.Errorf("gidMappings/uidMappings is supported only for mounts with the option 'bind'")
+		return fmt.Errorf("id-mapped mounts are only supported for bind-mounts")
 	}
 	if config.RootlessEUID {
-		return fmt.Errorf("gidMappings/uidMappings is not supported when runc is being launched with EUID != 0, needs CAP_SYS_ADMIN on the runc parent's user namespace")
+		return fmt.Errorf("id-mapped mounts are not supported for rootless containers")
 	}
-	if len(config.UIDMappings) == 0 || len(config.GIDMappings) == 0 {
-		return fmt.Errorf("not yet supported to use gidMappings/uidMappings in a mount without also using a user namespace")
-	}
-	if !sameMapping(config.UIDMappings, m.UIDMappings) {
-		return fmt.Errorf("not yet supported for the mount uidMappings to be different than user namespace uidMapping")
-	}
-	if !sameMapping(config.GIDMappings, m.GIDMappings) {
-		return fmt.Errorf("not yet supported for the mount gidMappings to be different than user namespace gidMapping")
-	}
-	if !filepath.IsAbs(m.Source) {
-		return fmt.Errorf("mount source not absolute")
-	}
-
 	return nil
 }
 
@@ -298,27 +285,6 @@ func mounts(config *configs.Config) error {
 	return nil
 }
 
-// sameMapping checks if the mappings are the same. If the mappings are the same
-// but in different order, it returns false.
-func sameMapping(a, b []configs.IDMap) bool {
-	if len(a) != len(b) {
-		return false
-	}
-
-	for i := range a {
-		if a[i].ContainerID != b[i].ContainerID {
-			return false
-		}
-		if a[i].HostID != b[i].HostID {
-			return false
-		}
-		if a[i].Size != b[i].Size {
-			return false
-		}
-	}
-	return true
-}
-
 func isHostNetNS(path string) (bool, error) {
 	const currentProcessNetns = "/proc/self/ns/net"
 
diff --git a/libcontainer/configs/validate/validator_test.go b/libcontainer/configs/validate/validator_test.go
index 58aae7a68f8..18f6814a380 100644
--- a/libcontainer/configs/validate/validator_test.go
+++ b/libcontainer/configs/validate/validator_test.go
@@ -436,22 +436,7 @@ func TestValidateIDMapMounts(t *testing.T) {
 			},
 		},
 		{
-			name:  "idmap mount without userns mappings",
-			isErr: true,
-			config: &configs.Config{
-				Mounts: []*configs.Mount{
-					{
-						Source:      "/abs/path/",
-						Destination: "/abs/path/",
-						Flags:       unix.MS_BIND,
-						UIDMappings: mapping,
-						GIDMappings: mapping,
-					},
-				},
-			},
-		},
-		{
-			name:  "idmap mounts with different userns and mount mappings",
+			name:  "idmap mounts without abs dest path",
 			isErr: true,
 			config: &configs.Config{
 				UIDMappings: mapping,
@@ -459,54 +444,40 @@ func TestValidateIDMapMounts(t *testing.T) {
 				Mounts: []*configs.Mount{
 					{
 						Source:      "/abs/path/",
-						Destination: "/abs/path/",
+						Destination: "./rel/path/",
 						Flags:       unix.MS_BIND,
-						UIDMappings: []configs.IDMap{
-							{
-								ContainerID: 10,
-								HostID:      10,
-								Size:        1,
-							},
-						},
+						UIDMappings: mapping,
 						GIDMappings: mapping,
 					},
 				},
 			},
 		},
 		{
-			name:  "idmap mounts with different userns and mount mappings",
-			isErr: true,
+			name: "simple idmap mount",
 			config: &configs.Config{
 				UIDMappings: mapping,
 				GIDMappings: mapping,
 				Mounts: []*configs.Mount{
 					{
-						Source:      "/abs/path/",
+						Source:      "/another-abs/path/",
 						Destination: "/abs/path/",
 						Flags:       unix.MS_BIND,
 						UIDMappings: mapping,
-						GIDMappings: []configs.IDMap{
-							{
-								ContainerID: 10,
-								HostID:      10,
-								Size:        1,
-							},
-						},
+						GIDMappings: mapping,
 					},
 				},
 			},
 		},
 		{
-			name:  "idmap mounts without abs source path",
-			isErr: true,
+			name: "idmap mount with more flags",
 			config: &configs.Config{
 				UIDMappings: mapping,
 				GIDMappings: mapping,
 				Mounts: []*configs.Mount{
 					{
-						Source:      "./rel/path/",
+						Source:      "/another-abs/path/",
 						Destination: "/abs/path/",
-						Flags:       unix.MS_BIND,
+						Flags:       unix.MS_BIND | unix.MS_RDONLY,
 						UIDMappings: mapping,
 						GIDMappings: mapping,
 					},
@@ -514,15 +485,12 @@ func TestValidateIDMapMounts(t *testing.T) {
 			},
 		},
 		{
-			name:  "idmap mounts without abs dest path",
-			isErr: true,
+			name: "idmap mount without userns mappings",
 			config: &configs.Config{
-				UIDMappings: mapping,
-				GIDMappings: mapping,
 				Mounts: []*configs.Mount{
 					{
 						Source:      "/abs/path/",
-						Destination: "./rel/path/",
+						Destination: "/abs/path/",
 						Flags:       unix.MS_BIND,
 						UIDMappings: mapping,
 						GIDMappings: mapping,
@@ -530,35 +498,46 @@ func TestValidateIDMapMounts(t *testing.T) {
 				},
 			},
 		},
-
 		{
-			name: "simple idmap mount",
+			name: "idmap mounts with different userns and mount mappings",
 			config: &configs.Config{
 				UIDMappings: mapping,
 				GIDMappings: mapping,
 				Mounts: []*configs.Mount{
 					{
-						Source:      "/another-abs/path/",
+						Source:      "/abs/path/",
 						Destination: "/abs/path/",
 						Flags:       unix.MS_BIND,
-						UIDMappings: mapping,
+						UIDMappings: []configs.IDMap{
+							{
+								ContainerID: 10,
+								HostID:      10,
+								Size:        1,
+							},
+						},
 						GIDMappings: mapping,
 					},
 				},
 			},
 		},
 		{
-			name: "idmap mount with more flags",
+			name: "idmap mounts with different userns and mount mappings",
 			config: &configs.Config{
 				UIDMappings: mapping,
 				GIDMappings: mapping,
 				Mounts: []*configs.Mount{
 					{
-						Source:      "/another-abs/path/",
+						Source:      "/abs/path/",
 						Destination: "/abs/path/",
-						Flags:       unix.MS_BIND | unix.MS_RDONLY,
+						Flags:       unix.MS_BIND,
 						UIDMappings: mapping,
-						GIDMappings: mapping,
+						GIDMappings: []configs.IDMap{
+							{
+								ContainerID: 10,
+								HostID:      10,
+								Size:        1,
+							},
+						},
 					},
 				},
 			},
diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go
index 889dadd34d1..11ea28afcad 100644
--- a/libcontainer/container_linux.go
+++ b/libcontainer/container_linux.go
@@ -2,7 +2,6 @@ package libcontainer
 
 import (
 	"bytes"
-	"encoding/json"
 	"errors"
 	"fmt"
 	"io"
@@ -45,6 +44,7 @@ type Container struct {
 	state                containerState
 	created              time.Time
 	fifo                 *os.File
+	safeExeFile          *os.File
 }
 
 // State represents a running container's state
@@ -316,6 +316,12 @@ func (c *Container) start(process *Process) (retErr error) {
 	if err != nil {
 		return fmt.Errorf("unable to create new parent process: %w", err)
 	}
+	// This is no longer needed after the process has been spawned. We also
+	// want to make sure that (especially in the case of O_TMPFILE descriptors)
+	// that we use a new copy for each execution, because an attacker
+	// overwriting our copy would be just as bad as overwiting the host runc
+	// binary if we re-use the copy.
+	defer c.clearSafeExe()
 
 	logsDone := parent.forwardChildLogs()
 	if logsDone != nil {
@@ -441,6 +447,125 @@ func (c *Container) includeExecFifo(cmd *exec.Cmd) error {
 	return nil
 }
 
+func (c *Container) clearSafeExe() {
+	if c.safeExeFile != nil {
+		_ = c.safeExeFile.Close()
+		c.safeExeFile = nil
+	}
+}
+
+// makeSafeExe makes a copy of /proc/self/exe that is safe to use when
+// executing inside a container. On modern kernels, this is a locked executable
+// memfd that contains a copy of /proc/self/exe. The returned string is a
+// /proc/self/fd/... path that can be used directly with "os/exec".Command().
+// For more details on why this is necessary, see CVE-2019-5736.
+func (c *Container) makeSafeExe() (path string, Err error) {
+	if c.safeExeFile == nil {
+		var err error
+		var sealFn func(**os.File) error
+
+		// Close safeExeFile if we fail to make it properly.
+		defer func() {
+			if c.safeExeFile != nil && Err != nil {
+				c.clearSafeExe()
+			}
+		}()
+
+		// First, try an executable memfd (supported since Linux 3.17).
+		c.safeExeFile, err = system.ExecutableMemfd("runc_cloned:/proc/self/exe", unix.MFD_ALLOW_SEALING|unix.MFD_CLOEXEC)
+		if err != nil {
+			logrus.Debugf("memfd cloned binary failed, falling back to O_TMPFILE: %v", err)
+		} else {
+			sealFn = func(f **os.File) error {
+				if err := (*f).Chmod(0o511); err != nil {
+					return fmt.Errorf("chmod memfd: %w", err)
+				}
+				// Try to set the newer memfd sealing flags, but we ignore
+				// errors because they are not needed and we want to continue
+				// to work on older kernels.
+				fd := (*f).Fd()
+				// F_SEAL_FUTURE_WRITE -- Linux 5.1
+				_, _ = unix.FcntlInt(fd, unix.F_ADD_SEALS, unix.F_SEAL_FUTURE_WRITE)
+				// F_SEAL_EXEC -- Linux 6.3
+				const F_SEAL_EXEC = 0x2000 //nolint:revive // this matches the unix.* name
+				_, _ = unix.FcntlInt(fd, unix.F_ADD_SEALS, F_SEAL_EXEC)
+				// Apply all original memfd seals.
+				_, err := unix.FcntlInt(fd, unix.F_ADD_SEALS, unix.F_SEAL_SEAL|unix.F_SEAL_SHRINK|unix.F_SEAL_GROW|unix.F_SEAL_WRITE)
+				return os.NewSyscallError("fcntl(F_ADD_SEALS)", err)
+			}
+		}
+
+		// Try to fallback to O_TMPFILE (supported since Linux 3.11).
+		if c.safeExeFile == nil {
+			var stat unix.Stat_t
+			c.safeExeFile, err = os.OpenFile(c.root, unix.O_TMPFILE|unix.O_RDWR|unix.O_EXCL|unix.O_CLOEXEC, 0o700)
+			if err != nil {
+				logrus.Debugf("O_TMPFILE cloned binary failed, falling back to mktemp(): %v", err)
+			} else if err := unix.Fstat(int(c.safeExeFile.Fd()), &stat); err != nil || stat.Nlink != 0 {
+				logrus.Debugf("O_TMPFILE cloned binary has non-zero nlink, falling back to mktemp(): %v", err)
+				c.clearSafeExe()
+			}
+
+			// Finally, fallback to a classic temporary file we unlink.
+			if c.safeExeFile == nil {
+				c.safeExeFile, err = os.CreateTemp(c.root, "runc.")
+				if err != nil {
+					return "", fmt.Errorf("could not clone binary: %w", err)
+				}
+				// Unlink the file and verify it was unlinked.
+				if err := os.Remove(c.safeExeFile.Name()); err != nil {
+					return "", fmt.Errorf("unlinking classic tmpfile: %w", err)
+				}
+				if err := unix.Fstat(int(c.safeExeFile.Fd()), &stat); err != nil {
+					return "", fmt.Errorf("classic tmpfile fstat: %w", err)
+				} else if stat.Nlink != 0 {
+					return "", fmt.Errorf("classic tmpfile %s has non-zero nlink after unlink", c.safeExeFile.Name())
+				}
+			}
+			sealFn = func(f **os.File) error {
+				if err := (*f).Chmod(0o511); err != nil {
+					return fmt.Errorf("chmod tmpfile: %w", err)
+				}
+				// When sealing an O_TMPFILE-style descriptor we need to
+				// re-open the path as O_PATH to clear the existing write
+				// handle we have.
+				opath, err := os.OpenFile(fmt.Sprintf("/proc/self/fd/%d", (*f).Fd()), unix.O_PATH|unix.O_CLOEXEC, 0)
+				if err != nil {
+					return fmt.Errorf("reopen tmpfile: %w", err)
+				}
+				_ = (*f).Close()
+				*f = opath
+				return nil
+			}
+		}
+
+		// Copy the contents of /proc/self/exe to the cloned fd.
+		srcExeFile, err := os.Open("/proc/self/exe")
+		if err != nil {
+			return "", fmt.Errorf("cannot open current process exe: %w", err)
+		}
+		defer srcExeFile.Close()
+		stat, err := srcExeFile.Stat()
+		if err != nil {
+			return "", fmt.Errorf("checking /proc/self/exe size: %w", err)
+		}
+		exeSize := stat.Size()
+
+		copied, err := system.Copy(c.safeExeFile, srcExeFile)
+		if err != nil {
+			return "", fmt.Errorf("copy binary: %w", err)
+		} else if copied != exeSize {
+			return "", fmt.Errorf("copied binary size mismatch: %d != %d", copied, exeSize)
+		}
+
+		// Seal the descriptor.
+		if err := sealFn(&c.safeExeFile); err != nil {
+			return "", fmt.Errorf("could not seal fd: %w", err)
+		}
+	}
+	return fmt.Sprintf("/proc/self/fd/%d", c.safeExeFile.Fd()), nil
+}
+
 func (c *Container) newParentProcess(p *Process) (parentProcess, error) {
 	parentInitPipe, childInitPipe, err := utils.NewSockPair("init")
 	if err != nil {
@@ -454,24 +579,12 @@ func (c *Container) newParentProcess(p *Process) (parentProcess, error) {
 	}
 	logFilePair := filePair{parentLogPipe, childLogPipe}
 
-	cmd := c.commandTemplate(p, childInitPipe, childLogPipe)
-	if !p.Init {
-		return c.newSetnsProcess(p, cmd, messageSockPair, logFilePair)
-	}
-
-	// We only set up fifoFd if we're not doing a `runc exec`. The historic
-	// reason for this is that previously we would pass a dirfd that allowed
-	// for container rootfs escape (and not doing it in `runc exec` avoided
-	// that problem), but we no longer do that. However, there's no need to do
-	// this for `runc exec` so we just keep it this way to be safe.
-	if err := c.includeExecFifo(cmd); err != nil {
-		return nil, fmt.Errorf("unable to setup exec fifo: %w", err)
+	exePath, err := c.makeSafeExe()
+	if err != nil {
+		return nil, fmt.Errorf("unable to create safe /proc/self/exe clone: %w", err)
 	}
-	return c.newInitProcess(p, cmd, messageSockPair, logFilePair)
-}
 
-func (c *Container) commandTemplate(p *Process, childInitPipe *os.File, childLogPipe *os.File) *exec.Cmd {
-	cmd := exec.Command("/proc/self/exe", "init")
+	cmd := exec.Command(exePath, "init")
 	cmd.Args[0] = os.Args[0]
 	cmd.Stdin = p.Stdin
 	cmd.Stdout = p.Stdout
@@ -500,118 +613,36 @@ func (c *Container) commandTemplate(p *Process, childInitPipe *os.File, childLog
 		"_LIBCONTAINER_LOGLEVEL="+p.LogLevel,
 	)
 
-	// NOTE: when running a container with no PID namespace and the parent process spawning the container is
-	// PID1 the pdeathsig is being delivered to the container's init process by the kernel for some reason
-	// even with the parent still running.
-	if c.config.ParentDeathSignal > 0 {
-		cmd.SysProcAttr.Pdeathsig = unix.Signal(c.config.ParentDeathSignal)
-	}
-	return cmd
-}
-
-// shouldSendMountSources says whether the child process must setup bind mounts with
-// the source pre-opened (O_PATH) in the host user namespace.
-// See https://github.com/opencontainers/runc/issues/2484
-func (c *Container) shouldSendMountSources() bool {
-	// Passing the mount sources via SCM_RIGHTS is only necessary when
-	// both userns and mntns are active.
-	if !c.config.Namespaces.Contains(configs.NEWUSER) ||
-		!c.config.Namespaces.Contains(configs.NEWNS) {
-		return false
-	}
-
-	// nsexec.c send_mountsources() requires setns(mntns) capabilities
-	// CAP_SYS_CHROOT and CAP_SYS_ADMIN.
-	if c.config.RootlessEUID {
-		return false
-	}
-
-	// We need to send sources if there are non-idmap bind-mounts.
-	for _, m := range c.config.Mounts {
-		if m.IsBind() && !m.IsIDMapped() {
-			return true
-		}
-	}
-
-	return false
-}
-
-// shouldSendIdmapSources says whether the child process must setup idmap mounts with
-// the mount_setattr already done in the host user namespace.
-func (c *Container) shouldSendIdmapSources() bool {
-	// nsexec.c mount_setattr() requires CAP_SYS_ADMIN in:
-	// * the user namespace the filesystem was mounted in;
-	// * the user namespace we're trying to idmap the mount to;
-	// * the owning user namespace of the mount namespace you're currently located in.
+	// Due to a Go stdlib bug, we need to add c.safeExeFile to the set of
+	// ExtraFiles otherwise it is possible for the stdlib to clobber the fd
+	// during forkAndExecInChild1 and replace it with some other file that
+	// might be malicious. This is less than ideal (because the descriptor will
+	// be non-O_CLOEXEC) however we have protections in "runc init" to stop us
+	// from leaking extra file descriptors.
 	//
-	// See the comment from Christian Brauner:
-	//	https://github.com/opencontainers/runc/pull/3717#discussion_r1103607972
-	//
-	// Let's just rule out rootless, we don't have those permission in the
-	// rootless case.
-	if c.config.RootlessEUID {
-		return false
-	}
+	// See <https://github.com/golang/go/issues/61751>.
+	cmd.ExtraFiles = append(cmd.ExtraFiles, c.safeExeFile)
 
-	// For the time being we require userns to be in use.
-	if !c.config.Namespaces.Contains(configs.NEWUSER) {
-		return false
+	// NOTE: when running a container with no PID namespace and the parent
+	//       process spawning the container is PID1 the pdeathsig is being
+	//       delivered to the container's init process by the kernel for some
+	//       reason even with the parent still running.
+	if c.config.ParentDeathSignal > 0 {
+		cmd.SysProcAttr.Pdeathsig = unix.Signal(c.config.ParentDeathSignal)
 	}
 
-	// We need to send sources if there are idmap bind-mounts.
-	for _, m := range c.config.Mounts {
-		if m.IsBind() && m.IsIDMapped() {
-			return true
+	if p.Init {
+		// We only set up fifoFd if we're not doing a `runc exec`. The historic
+		// reason for this is that previously we would pass a dirfd that allowed
+		// for container rootfs escape (and not doing it in `runc exec` avoided
+		// that problem), but we no longer do that. However, there's no need to do
+		// this for `runc exec` so we just keep it this way to be safe.
+		if err := c.includeExecFifo(cmd); err != nil {
+			return nil, fmt.Errorf("unable to setup exec fifo: %w", err)
 		}
+		return c.newInitProcess(p, cmd, messageSockPair, logFilePair)
 	}
-
-	return false
-}
-
-func (c *Container) sendMountSources(cmd *exec.Cmd, messageSockPair filePair) error {
-	if !c.shouldSendMountSources() {
-		return nil
-	}
-
-	return c.sendFdsSources(cmd, messageSockPair, "_LIBCONTAINER_MOUNT_FDS", func(m *configs.Mount) bool {
-		return m.IsBind() && !m.IsIDMapped()
-	})
-}
-
-func (c *Container) sendIdmapSources(cmd *exec.Cmd, messageSockPair filePair) error {
-	if !c.shouldSendIdmapSources() {
-		return nil
-	}
-
-	return c.sendFdsSources(cmd, messageSockPair, "_LIBCONTAINER_IDMAP_FDS", func(m *configs.Mount) bool {
-		return m.IsBind() && m.IsIDMapped()
-	})
-}
-
-func (c *Container) sendFdsSources(cmd *exec.Cmd, messageSockPair filePair, envVar string, condition func(*configs.Mount) bool) error {
-	// Elements on these slices will be paired with mounts (see StartInitialization() and
-	// prepareRootfs()). These slices MUST have the same size as c.config.Mounts.
-	fds := make([]int, len(c.config.Mounts))
-	for i, m := range c.config.Mounts {
-		if !condition(m) {
-			// The -1 fd is ignored later.
-			fds[i] = -1
-			continue
-		}
-
-		// The fd passed here will not be used: nsexec.c will overwrite it with dup3(). We just need
-		// to allocate a fd so that we know the number to pass in the environment variable. The fd
-		// must not be closed before cmd.Start(), so we reuse messageSockPair.child because the
-		// lifecycle of that fd is already taken care of.
-		cmd.ExtraFiles = append(cmd.ExtraFiles, messageSockPair.child)
-		fds[i] = stdioFdCount + len(cmd.ExtraFiles) - 1
-	}
-	fdsJSON, err := json.Marshal(fds)
-	if err != nil {
-		return fmt.Errorf("Error creating %v: %w", envVar, err)
-	}
-	cmd.Env = append(cmd.Env, envVar+"="+string(fdsJSON))
-	return nil
+	return c.newSetnsProcess(p, cmd, messageSockPair, logFilePair)
 }
 
 func (c *Container) newInitProcess(p *Process, cmd *exec.Cmd, messageSockPair, logFilePair filePair) (*initProcess, error) {
@@ -622,16 +653,10 @@ func (c *Container) newInitProcess(p *Process, cmd *exec.Cmd, messageSockPair, l
 			nsMaps[ns.Type] = ns.Path
 		}
 	}
-	data, err := c.bootstrapData(c.config.Namespaces.CloneFlags(), nsMaps, initStandard)
+	data, err := c.bootstrapData(c.config.Namespaces.CloneFlags(), nsMaps)
 	if err != nil {
 		return nil, err
 	}
-	if err := c.sendMountSources(cmd, messageSockPair); err != nil {
-		return nil, err
-	}
-	if err := c.sendIdmapSources(cmd, messageSockPair); err != nil {
-		return nil, err
-	}
 
 	init := &initProcess{
 		cmd:             cmd,
@@ -656,7 +681,7 @@ func (c *Container) newSetnsProcess(p *Process, cmd *exec.Cmd, messageSockPair,
 	}
 	// for setns process, we don't have to set cloneflags as the process namespaces
 	// will only be set via setns syscall
-	data, err := c.bootstrapData(0, state.NamespacePaths, initSetns)
+	data, err := c.bootstrapData(0, state.NamespacePaths)
 	if err != nil {
 		return nil, err
 	}
@@ -1046,7 +1071,7 @@ type netlinkError struct{ error }
 // such as one that uses nsenter package to bootstrap the container's
 // init process correctly, i.e. with correct namespaces, uid/gid
 // mapping etc.
-func (c *Container) bootstrapData(cloneFlags uintptr, nsMaps map[configs.NamespaceType]string, it initType) (_ io.Reader, Err error) {
+func (c *Container) bootstrapData(cloneFlags uintptr, nsMaps map[configs.NamespaceType]string) (_ io.Reader, Err error) {
 	// create the netlink message
 	r := nl.NewNetlinkRequest(int(InitMsg), 0)
 
@@ -1148,48 +1173,6 @@ func (c *Container) bootstrapData(cloneFlags uintptr, nsMaps map[configs.Namespa
 		Value: c.config.RootlessEUID,
 	})
 
-	// Bind mount source to open.
-	if it == initStandard && c.shouldSendMountSources() {
-		var mounts []byte
-		for _, m := range c.config.Mounts {
-			if m.IsBind() && !m.IsIDMapped() {
-				if strings.IndexByte(m.Source, 0) >= 0 {
-					return nil, fmt.Errorf("mount source string contains null byte: %q", m.Source)
-				}
-				mounts = append(mounts, []byte(m.Source)...)
-			}
-			mounts = append(mounts, byte(0))
-		}
-
-		r.AddData(&Bytemsg{
-			Type:  MountSourcesAttr,
-			Value: mounts,
-		})
-	}
-
-	// Idmap mount sources to open.
-	if it == initStandard && c.shouldSendIdmapSources() {
-		var mounts []byte
-		for _, m := range c.config.Mounts {
-			if m.IsBind() && m.IsIDMapped() {
-				// While other parts of the code check this too (like
-				// libcontainer/specconv/spec_linux.go) we do it here also because some libcontainer
-				// users don't use those functions.
-				if strings.IndexByte(m.Source, 0) >= 0 {
-					return nil, fmt.Errorf("mount source string contains null byte: %q", m.Source)
-				}
-
-				mounts = append(mounts, []byte(m.Source)...)
-			}
-			mounts = append(mounts, byte(0))
-		}
-
-		r.AddData(&Bytemsg{
-			Type:  IdmapSourcesAttr,
-			Value: mounts,
-		})
-	}
-
 	return bytes.NewReader(r.Serialize()), nil
 }
 
diff --git a/libcontainer/criu_linux.go b/libcontainer/criu_linux.go
index 31c8a900eb1..7b209aa07af 100644
--- a/libcontainer/criu_linux.go
+++ b/libcontainer/criu_linux.go
@@ -618,8 +618,8 @@ func (c *Container) prepareCriuRestoreMounts(mounts []*configs.Mount) error {
 			// because during initial container creation mounts are
 			// set up in the order they are configured.
 			if m.Device == "bind" {
-				if err := utils.WithProcfd(c.config.Rootfs, m.Destination, func(dstFD string) error {
-					return mountViaFDs(m.Source, nil, m.Destination, dstFD, "", unix.MS_BIND|unix.MS_REC, "")
+				if err := utils.WithProcfd(c.config.Rootfs, m.Destination, func(dstFd string) error {
+					return mountViaFds(m.Source, nil, m.Destination, dstFd, "", unix.MS_BIND|unix.MS_REC, "")
 				}); err != nil {
 					return err
 				}
@@ -1185,7 +1185,7 @@ func (c *Container) criuNotifications(resp *criurpc.CriuResp, process *Process,
 		defer master.Close()
 
 		// While we can access console.master, using the API is a good idea.
-		if err := utils.SendFd(process.ConsoleSocket, master.Name(), master.Fd()); err != nil {
+		if err := utils.SendFile(process.ConsoleSocket, master); err != nil {
 			return err
 		}
 	case "status-ready":
diff --git a/libcontainer/factory_linux.go b/libcontainer/factory_linux.go
index 4cb4a88bf0d..54a7bec91a1 100644
--- a/libcontainer/factory_linux.go
+++ b/libcontainer/factory_linux.go
@@ -214,18 +214,3 @@ func validateID(id string) error {
 
 	return nil
 }
-
-func parseFdsFromEnv(envVar string) ([]int, error) {
-	fdsJSON := os.Getenv(envVar)
-	if fdsJSON == "" {
-		// Always return the nil slice if no fd is present.
-		return nil, nil
-	}
-
-	var fds []int
-	if err := json.Unmarshal([]byte(fdsJSON), &fds); err != nil {
-		return nil, fmt.Errorf("Error unmarshalling %v: %w", envVar, err)
-	}
-
-	return fds, nil
-}
diff --git a/libcontainer/init_linux.go b/libcontainer/init_linux.go
index 42cae1ccb65..d72ca8a3dc9 100644
--- a/libcontainer/init_linux.go
+++ b/libcontainer/init_linux.go
@@ -5,9 +5,9 @@ import (
 	"encoding/json"
 	"errors"
 	"fmt"
-	"io"
 	"net"
 	"os"
+	"runtime"
 	"runtime/debug"
 	"strconv"
 	"strings"
@@ -47,18 +47,6 @@ type network struct {
 	TempVethPeerName string `json:"temp_veth_peer_name"`
 }
 
-type mountFds struct {
-	// sourceFds are the fds to use as source when mounting.
-	// The slice size should be the same as container mounts, as it will be
-	// paired with them.
-	// The value -1 is used when no fd is needed for the mount.
-	// Can't have a valid fd in the same position that other slices in this struct.
-	// We need to use only one of these fds on any single mount.
-	sourceFds []int
-	// Idem sourceFds, but fds of already created idmap mounts, to use with unix.MoveMount().
-	idmapFds []int
-}
-
 // initConfig is used for transferring parameters from Exec() to Init()
 type initConfig struct {
 	Args             []string              `json:"args"`
@@ -102,11 +90,8 @@ func StartInitialization() (retErr error) {
 	defer func() {
 		// We have an error during the initialization of the container's init,
 		// send it back to the parent process in the form of an initError.
-		if err := writeSync(pipe, procError); err != nil {
-			fmt.Fprintln(os.Stderr, retErr)
-			return
-		}
-		if err := utils.WriteJSON(pipe, &initError{Message: retErr.Error()}); err != nil {
+		ierr := initError{Message: retErr.Error()}
+		if err := writeSyncArg(pipe, procError, ierr); err != nil {
 			fmt.Fprintln(os.Stderr, retErr)
 			return
 		}
@@ -139,18 +124,6 @@ func StartInitialization() (retErr error) {
 		return fmt.Errorf("unable to convert _LIBCONTAINER_LOGPIPE: %w", err)
 	}
 
-	// Get mount files (O_PATH).
-	mountSrcFds, err := parseFdsFromEnv("_LIBCONTAINER_MOUNT_FDS")
-	if err != nil {
-		return err
-	}
-
-	// Get idmap fds.
-	idmapFds, err := parseFdsFromEnv("_LIBCONTAINER_IDMAP_FDS")
-	if err != nil {
-		return err
-	}
-
 	// clear the current process's environment to clean any libcontainer
 	// specific env vars.
 	os.Clearenv()
@@ -166,10 +139,10 @@ func StartInitialization() (retErr error) {
 	}()
 
 	// If init succeeds, it will not return, hence none of the defers will be called.
-	return containerInit(it, pipe, consoleSocket, fifofd, logPipeFd, mountFds{sourceFds: mountSrcFds, idmapFds: idmapFds})
+	return containerInit(it, pipe, consoleSocket, fifofd, logPipeFd)
 }
 
-func containerInit(t initType, pipe *os.File, consoleSocket *os.File, fifoFd, logFd int, mountFds mountFds) error {
+func containerInit(t initType, pipe *os.File, consoleSocket *os.File, fifoFd, logFd int) error {
 	var config *initConfig
 	if err := json.NewDecoder(pipe).Decode(&config); err != nil {
 		return err
@@ -179,11 +152,6 @@ func containerInit(t initType, pipe *os.File, consoleSocket *os.File, fifoFd, lo
 	}
 	switch t {
 	case initSetns:
-		// mount and idmap fds must be nil in this case. We don't mount while doing runc exec.
-		if mountFds.sourceFds != nil || mountFds.idmapFds != nil {
-			return errors.New("mount and idmap fds must be nil; can't mount from exec")
-		}
-
 		i := &linuxSetnsInit{
 			pipe:          pipe,
 			consoleSocket: consoleSocket,
@@ -199,7 +167,6 @@ func containerInit(t initType, pipe *os.File, consoleSocket *os.File, fifoFd, lo
 			config:        config,
 			fifoFd:        fifoFd,
 			logFd:         logFd,
-			mountFds:      mountFds,
 		}
 		return i.Init()
 	}
@@ -317,7 +284,6 @@ func setupConsole(socket *os.File, config *initConfig, mount bool) error {
 	if err != nil {
 		return err
 	}
-
 	// After we return from here, we don't need the console anymore.
 	defer pty.Close()
 
@@ -339,9 +305,11 @@ func setupConsole(socket *os.File, config *initConfig, mount bool) error {
 		}
 	}
 	// While we can access console.master, using the API is a good idea.
-	if err := utils.SendFd(socket, pty.Name(), pty.Fd()); err != nil {
+	if err := utils.SendRawFd(socket, pty.Name(), pty.Fd()); err != nil {
 		return err
 	}
+	runtime.KeepAlive(pty)
+
 	// Now, dup over all the things.
 	return dupStdio(slavePath)
 }
@@ -349,12 +317,11 @@ func setupConsole(socket *os.File, config *initConfig, mount bool) error {
 // syncParentReady sends to the given pipe a JSON payload which indicates that
 // the init is ready to Exec the child process. It then waits for the parent to
 // indicate that it is cleared to Exec.
-func syncParentReady(pipe io.ReadWriter) error {
+func syncParentReady(pipe *os.File) error {
 	// Tell parent.
 	if err := writeSync(pipe, procReady); err != nil {
 		return err
 	}
-
 	// Wait for parent to give the all-clear.
 	return readSync(pipe, procRun)
 }
@@ -362,44 +329,37 @@ func syncParentReady(pipe io.ReadWriter) error {
 // syncParentHooks sends to the given pipe a JSON payload which indicates that
 // the parent should execute pre-start hooks. It then waits for the parent to
 // indicate that it is cleared to resume.
-func syncParentHooks(pipe io.ReadWriter) error {
+func syncParentHooks(pipe *os.File) error {
 	// Tell parent.
 	if err := writeSync(pipe, procHooks); err != nil {
 		return err
 	}
-
 	// Wait for parent to give the all-clear.
 	return readSync(pipe, procResume)
 }
 
-// syncParentSeccomp sends to the given pipe a JSON payload which
-// indicates that the parent should pick up the seccomp fd with pidfd_getfd()
-// and send it to the seccomp agent over a unix socket. It then waits for
-// the parent to indicate that it is cleared to resume and closes the seccompFd.
-// If the seccompFd is -1, there isn't anything to sync with the parent, so it
-// returns no error.
-func syncParentSeccomp(pipe io.ReadWriter, seccompFd int) error {
-	if seccompFd == -1 {
+// syncParentSeccomp sends the fd associated with the seccomp file descriptor
+// to the parent, and wait for the parent to do pidfd_getfd() to grab a copy.
+func syncParentSeccomp(pipe *os.File, seccompFd *os.File) error {
+	if seccompFd == nil {
 		return nil
 	}
-
-	// Tell parent.
-	if err := writeSyncWithFd(pipe, procSeccomp, seccompFd); err != nil {
-		unix.Close(seccompFd)
+	defer seccompFd.Close()
+
+	// Tell parent to grab our fd.
+	//
+	// Notably, we do not use writeSyncFile here because a container might have
+	// an SCMP_ACT_NOTIFY action on sendmsg(2) so we need to use the smallest
+	// possible number of system calls here because all of those syscalls
+	// cannot be used with SCMP_ACT_NOTIFY as a result (any syscall we use here
+	// before the parent gets the file descriptor would deadlock "runc init" if
+	// we allowed it for SCMP_ACT_NOTIFY). See seccomp.InitSeccomp() for more
+	// details.
+	if err := writeSyncArg(pipe, procSeccomp, seccompFd.Fd()); err != nil {
 		return err
 	}
-
-	// Wait for parent to give the all-clear.
-	if err := readSync(pipe, procSeccompDone); err != nil {
-		unix.Close(seccompFd)
-		return fmt.Errorf("sync parent seccomp: %w", err)
-	}
-
-	if err := unix.Close(seccompFd); err != nil {
-		return fmt.Errorf("close seccomp fd: %w", err)
-	}
-
-	return nil
+	// Wait for parent to tell us they've grabbed the seccompfd.
+	return readSync(pipe, procSeccompDone)
 }
 
 // setupUser changes the groups, gid, and uid for the user inside the container
diff --git a/libcontainer/integration/execin_test.go b/libcontainer/integration/execin_test.go
index 58519a419c0..c5c324130c6 100644
--- a/libcontainer/integration/execin_test.go
+++ b/libcontainer/integration/execin_test.go
@@ -277,9 +277,9 @@ func TestExecInTTY(t *testing.T) {
 
 		done := make(chan (error))
 		go func() {
-			f, err := utils.RecvFd(parent)
+			f, err := utils.RecvFile(parent)
 			if err != nil {
-				done <- fmt.Errorf("RecvFd: %w", err)
+				done <- fmt.Errorf("RecvFile: %w", err)
 				return
 			}
 			c, err := console.ConsoleFromFile(f)
diff --git a/libcontainer/message_linux.go b/libcontainer/message_linux.go
index 17db81a29f3..d78e0328a20 100644
--- a/libcontainer/message_linux.go
+++ b/libcontainer/message_linux.go
@@ -21,8 +21,6 @@ const (
 	RootlessEUIDAttr uint16 = 27287
 	UidmapPathAttr   uint16 = 27288
 	GidmapPathAttr   uint16 = 27289
-	MountSourcesAttr uint16 = 27290
-	IdmapSourcesAttr uint16 = 27291
 )
 
 type Int32msg struct {
diff --git a/libcontainer/mount_linux.go b/libcontainer/mount_linux.go
index da11da68ea0..ce71f63ad68 100644
--- a/libcontainer/mount_linux.go
+++ b/libcontainer/mount_linux.go
@@ -1,22 +1,47 @@
 package libcontainer
 
 import (
+	"errors"
+	"fmt"
 	"io/fs"
+	"os"
 	"strconv"
 
+	"github.com/sirupsen/logrus"
 	"golang.org/x/sys/unix"
+
+	"github.com/opencontainers/runc/libcontainer/configs"
+	"github.com/opencontainers/runc/libcontainer/userns"
+)
+
+// mountSourceType indicates what type of file descriptor is being returned. It
+// is used to tell rootfs_linux.go whether or not to use move_mount(2) to
+// install the mount.
+type mountSourceType string
+
+const (
+	// An open_tree(2)-style file descriptor that needs to be installed using
+	// move_mount(2) to install.
+	mountSourceOpenTree mountSourceType = "open_tree"
+	// A plain file descriptor that can be mounted through /proc/self/fd.
+	mountSourcePlain mountSourceType = "plain-open"
 )
 
+type mountSource struct {
+	Type mountSourceType `json:"source_type"`
+	file *os.File        `json:"-"`
+}
+
 // mountError holds an error from a failed mount or unmount operation.
 type mountError struct {
-	op     string
-	source string
-	srcFD  *int
-	target string
-	dstFD  string
-	flags  uintptr
-	data   string
-	err    error
+	op      string
+	source  string
+	srcFile *mountSource
+	target  string
+	dstFd   string
+	flags   uintptr
+	data    string
+	err     error
 }
 
 // Error provides a string error representation.
@@ -25,13 +50,14 @@ func (e *mountError) Error() string {
 
 	if e.source != "" {
 		out += "src=" + e.source + ", "
-		if e.srcFD != nil {
-			out += "srcFD=" + strconv.Itoa(*e.srcFD) + ", "
+		if e.srcFile != nil {
+			out += "srcType=" + string(e.srcFile.Type) + ", "
+			out += "srcFd=" + strconv.Itoa(int(e.srcFile.file.Fd())) + ", "
 		}
 	}
 	out += "dst=" + e.target
-	if e.dstFD != "" {
-		out += ", dstFD=" + e.dstFD
+	if e.dstFd != "" {
+		out += ", dstFd=" + e.dstFd
 	}
 
 	if e.flags != uintptr(0) {
@@ -54,38 +80,58 @@ func (e *mountError) Unwrap() error {
 // mount is a simple unix.Mount wrapper, returning an error with more context
 // in case it failed.
 func mount(source, target, fstype string, flags uintptr, data string) error {
-	return mountViaFDs(source, nil, target, "", fstype, flags, data)
+	return mountViaFds(source, nil, target, "", fstype, flags, data)
 }
 
-// mountViaFDs is a unix.Mount wrapper which uses srcFD instead of source,
-// and dstFD instead of target, unless those are empty.
-// If srcFD is different than nil, its path (i.e. "/proc/self/fd/NN") will be
-// constructed by this function.
-// dstFD argument, if non-empty, is expected to be in the form of a path to an
-// opened file descriptor on procfs (i.e. "/proc/self/fd/NN").
+// mountViaFds is a unix.Mount wrapper which uses srcFile instead of source,
+// and dstFd instead of target, unless those are empty.
 //
-// If case an FD is used instead of a source or a target path, the
-// corresponding path is only used to add context to an error in case
-// the mount operation has failed.
-func mountViaFDs(source string, srcFD *int, target, dstFD, fstype string, flags uintptr, data string) error {
-	src := source
-	if srcFD != nil {
-		src = "/proc/self/fd/" + strconv.Itoa(*srcFD)
+// If srcFile is non-nil and flags does not contain MS_REMOUNT, mountViaFds
+// will mount it according to the mountSourceType of the file descriptor.
+//
+// The dstFd argument, if non-empty, is expected to be in the form of a path to
+// an opened file descriptor on procfs (i.e. "/proc/self/fd/NN").
+//
+// If a file descriptor is used instead of a source or a target path, the
+// corresponding path is only used to add context to an error in case the mount
+// operation has failed.
+func mountViaFds(source string, srcFile *mountSource, target, dstFd, fstype string, flags uintptr, data string) error {
+	// MS_REMOUNT and srcFile don't make sense together.
+	if srcFile != nil && flags&unix.MS_REMOUNT != 0 {
+		logrus.Debugf("mount source passed along with MS_REMOUNT -- ignoring srcFile")
+		srcFile = nil
 	}
+
 	dst := target
-	if dstFD != "" {
-		dst = dstFD
+	if dstFd != "" {
+		dst = dstFd
+	}
+	src := source
+	if srcFile != nil {
+		src = "/proc/self/fd/" + strconv.Itoa(int(srcFile.file.Fd()))
 	}
-	if err := unix.Mount(src, dst, fstype, flags, data); err != nil {
+
+	var op string
+	var err error
+	if srcFile != nil && srcFile.Type == mountSourceOpenTree {
+		op = "move_mount"
+		err = unix.MoveMount(int(srcFile.file.Fd()), "",
+			unix.AT_FDCWD, dstFd,
+			unix.MOVE_MOUNT_F_EMPTY_PATH|unix.MOVE_MOUNT_T_SYMLINKS)
+	} else {
+		op = "mount"
+		err = unix.Mount(src, dst, fstype, flags, data)
+	}
+	if err != nil {
 		return &mountError{
-			op:     "mount",
-			source: source,
-			srcFD:  srcFD,
-			target: target,
-			dstFD:  dstFD,
-			flags:  flags,
-			data:   data,
-			err:    err,
+			op:      op,
+			source:  source,
+			srcFile: srcFile,
+			target:  target,
+			dstFd:   dstFd,
+			flags:   flags,
+			data:    data,
+			err:     err,
 		}
 	}
 	return nil
@@ -121,3 +167,75 @@ func syscallMode(i fs.FileMode) (o uint32) {
 	// No mapping for Go's ModeTemporary (plan9 only).
 	return
 }
+
+// mountFd creates an open_tree(2)-like mount fd from the provided
+// configuration. This function must be called from within the container's
+// mount namespace.
+func mountFd(nsHandles *userns.Handles, m *configs.Mount) (mountSource, error) {
+	if !m.IsBind() {
+		return mountSource{}, errors.New("new mount api: only bind-mounts are supported")
+	}
+	if nsHandles == nil {
+		nsHandles = new(userns.Handles)
+		defer nsHandles.Release()
+	}
+
+	var mountFile *os.File
+	var sourceType mountSourceType
+
+	if m.IsBind() {
+		// We only need to use OPEN_TREE_CLONE in the case where we need to use
+		// mount_setattr(2). We are currently in the container namespace and
+		// there is no risk of an opened directory being used to escape the
+		// container. OPEN_TREE_CLONE is more expensive than open(2) because it
+		// requires doing mounts inside a new anonymous mount namespace.
+		if m.IsIDMapped() {
+			flags := uint(unix.OPEN_TREE_CLONE | unix.O_CLOEXEC)
+			if m.Flags&unix.MS_REC == unix.MS_REC {
+				flags |= unix.AT_RECURSIVE
+			}
+			mountFd, err := unix.OpenTree(unix.AT_FDCWD, m.Source, flags)
+			if err != nil {
+				return mountSource{}, &os.PathError{Op: "open_tree(OPEN_TREE_CLONE)", Path: m.Source, Err: err}
+			}
+			mountFile = os.NewFile(uintptr(mountFd), m.Source)
+			sourceType = mountSourceOpenTree
+		} else {
+			var err error
+			mountFile, err = os.OpenFile(m.Source, unix.O_PATH|unix.O_CLOEXEC, 0)
+			if err != nil {
+				return mountSource{}, err
+			}
+			sourceType = mountSourcePlain
+		}
+	}
+
+	if m.IsIDMapped() {
+		if mountFile == nil {
+			return mountSource{}, fmt.Errorf("invalid mount source %q: id-mapping of non-bind-mounts is not supported", m.Source)
+		}
+		if sourceType != mountSourceOpenTree {
+			// should never happen
+			return mountSource{}, fmt.Errorf("invalid mount source %q: id-mapped target mistakenly opened without OPEN_TREE_CLONE", m.Source)
+		}
+
+		usernsFile, err := nsHandles.Get(userns.Mapping{
+			UIDMappings: m.UIDMappings,
+			GIDMappings: m.GIDMappings,
+		})
+		if err != nil {
+			return mountSource{}, fmt.Errorf("failed to create userns for %s id-mapping: %w", m.Source, err)
+		}
+		defer usernsFile.Close()
+		if err := unix.MountSetattr(int(mountFile.Fd()), "", unix.AT_EMPTY_PATH, &unix.MountAttr{
+			Attr_set:  unix.MOUNT_ATTR_IDMAP,
+			Userns_fd: uint64(usernsFile.Fd()),
+		}); err != nil {
+			return mountSource{}, fmt.Errorf("failed to set IDMAP_SOURCE_ATTR on %s: %w", m.Source, err)
+		}
+	}
+	return mountSource{
+		Type: sourceType,
+		file: mountFile,
+	}, nil
+}
diff --git a/libcontainer/nsenter/cloned_binary.c b/libcontainer/nsenter/cloned_binary.c
deleted file mode 100644
index a7f992fddd7..00000000000
--- a/libcontainer/nsenter/cloned_binary.c
+++ /dev/null
@@ -1,567 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0 OR LGPL-2.1-or-later
-/*
- * Copyright (C) 2019 Aleksa Sarai <cyphar@cyphar.com>
- * Copyright (C) 2019 SUSE LLC
- *
- * This work is dual licensed under the following licenses. You may use,
- * redistribute, and/or modify the work under the conditions of either (or
- * both) licenses.
- *
- * === Apache-2.0 ===
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- *
- * === LGPL-2.1-or-later ===
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library. If not, see
- * <https://www.gnu.org/licenses/>.
- *
- */
-
-#define _GNU_SOURCE
-#include <unistd.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <stdbool.h>
-#include <string.h>
-#include <limits.h>
-#include <fcntl.h>
-#include <errno.h>
-
-#include <sched.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <sys/statfs.h>
-#include <sys/vfs.h>
-#include <sys/mman.h>
-#include <sys/mount.h>
-#include <sys/sendfile.h>
-#include <sys/socket.h>
-#include <sys/syscall.h>
-#include <sys/wait.h>
-
-#include "ipc.h"
-#include "log.h"
-
-/* Use our own wrapper for memfd_create. */
-#ifndef SYS_memfd_create
-#  ifdef __NR_memfd_create
-#    define SYS_memfd_create __NR_memfd_create
-#  else
-/* These values come from <https://fedora.juszkiewicz.com.pl/syscalls.html>. */
-#    warning "libc is outdated -- using hard-coded SYS_memfd_create"
-#    if defined(__x86_64__)
-#      define SYS_memfd_create 319
-#    elif defined(__i386__)
-#      define SYS_memfd_create 356
-#    elif defined(__ia64__)
-#      define SYS_memfd_create 1340
-#    elif defined(__arm__)
-#      define SYS_memfd_create 385
-#    elif defined(__aarch64__)
-#      define SYS_memfd_create 279
-#    elif defined(__ppc__) || defined(__PPC64__) || defined(__powerpc64__)
-#      define SYS_memfd_create 360
-#    elif defined(__s390__) || defined(__s390x__)
-#      define SYS_memfd_create 350
-#    else
-#      warning "unknown architecture -- cannot hard-code SYS_memfd_create"
-#    endif
-#  endif
-#endif
-
-/* memfd_create(2) flags -- copied from <linux/memfd.h>. */
-#ifndef MFD_CLOEXEC
-#  define MFD_CLOEXEC       0x0001U
-#  define MFD_ALLOW_SEALING 0x0002U
-#endif
-#ifndef MFD_EXEC
-#  define MFD_EXEC          0x0010U
-#endif
-
-int memfd_create(const char *name, unsigned int flags)
-{
-#ifdef SYS_memfd_create
-	return syscall(SYS_memfd_create, name, flags);
-#else
-	errno = ENOSYS;
-	return -1;
-#endif
-}
-
-/* This comes directly from <linux/fcntl.h>. */
-#ifndef F_LINUX_SPECIFIC_BASE
-#  define F_LINUX_SPECIFIC_BASE 1024
-#endif
-#ifndef F_ADD_SEALS
-#  define F_ADD_SEALS (F_LINUX_SPECIFIC_BASE + 9)
-#  define F_GET_SEALS (F_LINUX_SPECIFIC_BASE + 10)
-#endif
-#ifndef F_SEAL_SEAL
-#  define F_SEAL_SEAL          0x0001	/* prevent further seals from being set */
-#  define F_SEAL_SHRINK        0x0002	/* prevent file from shrinking */
-#  define F_SEAL_GROW          0x0004	/* prevent file from growing */
-#  define F_SEAL_WRITE         0x0008	/* prevent writes */
-#endif
-#ifndef F_SEAL_FUTURE_WRITE
-#  define F_SEAL_FUTURE_WRITE  0x0010	/* prevent future writes while mapped */
-#endif
-#ifndef F_SEAL_EXEC
-#  define F_SEAL_EXEC          0x0020	/* prevent chmod modifying exec bits */
-#endif
-
-#define CLONED_BINARY_ENV "_LIBCONTAINER_CLONED_BINARY"
-#define RUNC_MEMFD_COMMENT "runc_cloned:/proc/self/exe"
-/*
- * There are newer memfd seals (such as F_SEAL_FUTURE_WRITE and F_SEAL_EXEC),
- * which we use opportunistically. However, this set is the original set of
- * memfd seals, and we require them all to be set to trust our /proc/self/exe
- * if it is a memfd.
- */
-#define RUNC_MEMFD_MIN_SEALS \
-	(F_SEAL_SEAL | F_SEAL_SHRINK | F_SEAL_GROW | F_SEAL_WRITE)
-
-static void *must_realloc(void *ptr, size_t size)
-{
-	void *old = ptr;
-	do {
-		ptr = realloc(old, size);
-	} while (!ptr);
-	return ptr;
-}
-
-/*
- * Verify whether we are currently in a self-cloned program (namely, is
- * /proc/self/exe a memfd). F_GET_SEALS will only succeed for memfds (or rather
- * for shmem files), and we want to be sure it's actually sealed.
- */
-static int is_self_cloned(void)
-{
-	int fd, seals = 0, is_cloned = false;
-	struct stat statbuf = { };
-	struct statfs fsbuf = { };
-
-	fd = open("/proc/self/exe", O_RDONLY | O_CLOEXEC);
-	if (fd < 0) {
-		write_log(ERROR, "cannot open runc binary for reading: open /proc/self/exe: %m");
-		return -ENOTRECOVERABLE;
-	}
-
-	/*
-	 * Is the binary a fully-sealed memfd? We don't need CLONED_BINARY_ENV for
-	 * this, because you cannot write to a sealed memfd no matter what.
-	 */
-	seals = fcntl(fd, F_GET_SEALS);
-	if (seals >= 0) {
-		write_log(DEBUG, "checking /proc/self/exe memfd seals: 0x%x", seals);
-		is_cloned = (seals & RUNC_MEMFD_MIN_SEALS) == RUNC_MEMFD_MIN_SEALS;
-		if (is_cloned)
-			goto out;
-	}
-
-	/*
-	 * All other forms require CLONED_BINARY_ENV, since they are potentially
-	 * writeable (or we can't tell if they're fully safe) and thus we must
-	 * check the environment as an extra layer of defence.
-	 */
-	if (!getenv(CLONED_BINARY_ENV)) {
-		is_cloned = false;
-		goto out;
-	}
-
-	/*
-	 * Is the binary on a read-only filesystem? We can't detect bind-mounts in
-	 * particular (in-kernel they are identical to regular mounts) but we can
-	 * at least be sure that it's read-only. In addition, to make sure that
-	 * it's *our* bind-mount we check CLONED_BINARY_ENV.
-	 */
-	if (fstatfs(fd, &fsbuf) >= 0)
-		is_cloned |= (fsbuf.f_flags & MS_RDONLY);
-
-	/*
-	 * Okay, we're a tmpfile -- or we're currently running on RHEL <=7.6
-	 * which appears to have a borked backport of F_GET_SEALS. Either way,
-	 * having a file which has no hardlinks indicates that we aren't using
-	 * a host-side "runc" binary and this is something that a container
-	 * cannot fake (because unlinking requires being able to resolve the
-	 * path that you want to unlink).
-	 */
-	if (fstat(fd, &statbuf) >= 0)
-		is_cloned |= (statbuf.st_nlink == 0);
-
-out:
-	close(fd);
-	return is_cloned;
-}
-
-/* Read a given file into a new buffer, and providing the length. */
-static char *read_file(char *path, size_t *length)
-{
-	int fd;
-	char buf[4096], *copy = NULL;
-
-	if (!length)
-		return NULL;
-
-	fd = open(path, O_RDONLY | O_CLOEXEC);
-	if (fd < 0)
-		return NULL;
-
-	*length = 0;
-	for (;;) {
-		ssize_t n;
-
-		n = read(fd, buf, sizeof(buf));
-		if (n < 0)
-			goto error;
-		if (!n)
-			break;
-
-		copy = must_realloc(copy, (*length + n) * sizeof(*copy));
-		memcpy(copy + *length, buf, n);
-		*length += n;
-	}
-	close(fd);
-	return copy;
-
-error:
-	close(fd);
-	free(copy);
-	return NULL;
-}
-
-/*
- * A poor-man's version of "xargs -0". Basically parses a given block of
- * NUL-delimited data, within the given length and adds a pointer to each entry
- * to the array of pointers.
- */
-static int parse_xargs(char *data, int data_length, char ***output)
-{
-	int num = 0;
-	char *cur = data;
-
-	if (!data || *output != NULL)
-		return -1;
-
-	while (cur < data + data_length) {
-		num++;
-		*output = must_realloc(*output, (num + 1) * sizeof(**output));
-		(*output)[num - 1] = cur;
-		cur += strlen(cur) + 1;
-	}
-	(*output)[num] = NULL;
-	return num;
-}
-
-/*
- * "Parse" out argv from /proc/self/cmdline.
- * This is necessary because we are running in a context where we don't have a
- * main() that we can just get the arguments from.
- */
-static int fetchve(char ***argv)
-{
-	char *cmdline = NULL;
-	size_t cmdline_size;
-
-	cmdline = read_file("/proc/self/cmdline", &cmdline_size);
-	if (!cmdline)
-		goto error;
-
-	if (parse_xargs(cmdline, cmdline_size, argv) <= 0)
-		goto error;
-
-	return 0;
-
-error:
-	free(cmdline);
-	return -EINVAL;
-}
-
-enum {
-	EFD_NONE = 0,
-	EFD_MEMFD,
-	EFD_FILE,
-};
-
-/*
- * This comes from <linux/fcntl.h>. We can't hard-code __O_TMPFILE because it
- * changes depending on the architecture. If we don't have O_TMPFILE we always
- * have the mkostemp(3) fallback.
- */
-#ifndef O_TMPFILE
-#  if defined(__O_TMPFILE) && defined(O_DIRECTORY)
-#    define O_TMPFILE (__O_TMPFILE | O_DIRECTORY)
-#  endif
-#endif
-
-static inline bool is_memfd_unsupported_error(int err)
-{
-	/*
-	 * - ENOSYS is obviously an "unsupported" error.
-	 *
-	 * - EINVAL could be hit if MFD_EXEC is not supported (pre-6.3 kernel),
-	 *   but it can also be hit if vm.memfd_noexec=2 (in kernels without
-	 *   [1] applied) and the flags does not contain MFD_EXEC. However,
-	 *   there was a bug in the original 6.3 implementation of
-	 *   vm.memfd_noexec=2, which meant that MFD_EXEC would work even in
-	 *   the "strict" mode. Because we try MFD_EXEC first, we won't get
-	 *   EINVAL in the vm.memfd_noexec=2 case (which means we don't need to
-	 *   figure out whether to log the message about memfd_create).
-	 *
-	 * - EACCES is returned in kernels that contain [1] in the
-	 *   vm.memfd_noexec=2 case.
-	 *
-	 * At time of writing, [1] is not in Linus's tree and it't not clear if
-	 * it will be backported to stable, so what exact versions apply here
-	 * is unclear. But the bug is present in 6.3-6.5 at the very least.
-	 *
-	 * [1]: https://lore.kernel.org/all/20230705063315.3680666-2-jeffxu@google.com/
-	 */
-	if (err == EACCES)
-		write_log(INFO,
-			  "memfd_create(MFD_EXEC) failed, possibly due to vm.memfd_noexec=2 -- falling back to less secure O_TMPFILE");
-	return err == ENOSYS || err == EINVAL || err == EACCES;
-}
-
-static int make_execfd(int *fdtype)
-{
-	int fd = -1;
-	char template[PATH_MAX] = { 0 };
-	char *prefix = getenv("_LIBCONTAINER_STATEDIR");
-
-	if (!prefix || *prefix != '/')
-		prefix = "/tmp";
-	if (snprintf(template, sizeof(template), "%s/runc.XXXXXX", prefix) < 0)
-		return -1;
-
-	/*
-	 * Now try memfd, it's much nicer than actually creating a file in STATEDIR
-	 * since it's easily detected thanks to sealing and also doesn't require
-	 * assumptions about STATEDIR.
-	 */
-	*fdtype = EFD_MEMFD;
-	/*
-	 * On newer kernels we should set MFD_EXEC to indicate we need +x
-	 * permissions. Otherwise an admin with vm.memfd_noexec=1 would subtly
-	 * break runc. vm.memfd_noexec=2 is a little bit more complicated, see the
-	 * comment in is_memfd_unsupported_error() -- the upshot is that doing it
-	 * this way works, but only because of two overlapping bugs in the sysctl
-	 * implementation.
-	 */
-	fd = memfd_create(RUNC_MEMFD_COMMENT, MFD_EXEC | MFD_CLOEXEC | MFD_ALLOW_SEALING);
-	if (fd < 0 && is_memfd_unsupported_error(errno))
-		fd = memfd_create(RUNC_MEMFD_COMMENT, MFD_CLOEXEC | MFD_ALLOW_SEALING);
-	if (fd >= 0)
-		return fd;
-	if (!is_memfd_unsupported_error(errno))
-		goto error;
-
-#ifdef O_TMPFILE
-	/*
-	 * Try O_TMPFILE to avoid races where someone might snatch our file. Note
-	 * that O_EXCL isn't actually a security measure here (since you can just
-	 * fd re-open it and clear O_EXCL).
-	 */
-	*fdtype = EFD_FILE;
-	fd = open(prefix, O_TMPFILE | O_EXCL | O_RDWR | O_CLOEXEC, 0700);
-	if (fd >= 0) {
-		struct stat statbuf = { };
-		bool working_otmpfile = false;
-
-		/*
-		 * open(2) ignores unknown O_* flags -- yeah, I was surprised when I
-		 * found this out too. As a result we can't check for EINVAL. However,
-		 * if we get nlink != 0 (or EISDIR) then we know that this kernel
-		 * doesn't support O_TMPFILE.
-		 */
-		if (fstat(fd, &statbuf) >= 0)
-			working_otmpfile = (statbuf.st_nlink == 0);
-
-		if (working_otmpfile)
-			return fd;
-
-		/* Pretend that we got EISDIR since O_TMPFILE failed. */
-		close(fd);
-		errno = EISDIR;
-	}
-	if (errno != EISDIR)
-		goto error;
-#endif /* defined(O_TMPFILE) */
-
-	/*
-	 * Our final option is to create a temporary file the old-school way, and
-	 * then unlink it so that nothing else sees it by accident.
-	 */
-	*fdtype = EFD_FILE;
-	fd = mkostemp(template, O_CLOEXEC);
-	if (fd >= 0) {
-		if (unlink(template) >= 0)
-			return fd;
-		close(fd);
-	}
-
-error:
-	*fdtype = EFD_NONE;
-	return -1;
-}
-
-static int seal_execfd(int *fd, int fdtype)
-{
-	switch (fdtype) {
-	case EFD_MEMFD:{
-			/*
-			 * Try to seal with newer seals, but we ignore errors because older
-			 * kernels don't support some of them. For container security only
-			 * RUNC_MEMFD_MIN_SEALS are strictly required, but the rest are
-			 * nice-to-haves. We apply RUNC_MEMFD_MIN_SEALS at the end because it
-			 * contains F_SEAL_SEAL.
-			 */
-			int __attribute__((unused)) _err1 = fcntl(*fd, F_ADD_SEALS, F_SEAL_FUTURE_WRITE);	// Linux 5.1
-			int __attribute__((unused)) _err2 = fcntl(*fd, F_ADD_SEALS, F_SEAL_EXEC);	// Linux 6.3
-			return fcntl(*fd, F_ADD_SEALS, RUNC_MEMFD_MIN_SEALS);
-		}
-	case EFD_FILE:{
-			/* Need to re-open our pseudo-memfd as an O_PATH to avoid execve(2) giving -ETXTBSY. */
-			int newfd;
-			char fdpath[PATH_MAX] = { 0 };
-
-			if (fchmod(*fd, 0100) < 0)
-				return -1;
-
-			if (snprintf(fdpath, sizeof(fdpath), "/proc/self/fd/%d", *fd) < 0)
-				return -1;
-
-			newfd = open(fdpath, O_PATH | O_CLOEXEC);
-			if (newfd < 0)
-				return -1;
-
-			close(*fd);
-			*fd = newfd;
-			return 0;
-		}
-	default:
-		break;
-	}
-	return -1;
-}
-
-static ssize_t fd_to_fd(int outfd, int infd)
-{
-	ssize_t total = 0;
-	char buffer[4096];
-
-	for (;;) {
-		ssize_t nread, nwritten = 0;
-
-		nread = read(infd, buffer, sizeof(buffer));
-		if (nread < 0)
-			return -1;
-		if (!nread)
-			break;
-
-		do {
-			ssize_t n = write(outfd, buffer + nwritten, nread - nwritten);
-			if (n < 0)
-				return -1;
-			nwritten += n;
-		} while (nwritten < nread);
-
-		total += nwritten;
-	}
-
-	return total;
-}
-
-static int clone_binary(void)
-{
-	int binfd, execfd;
-	struct stat statbuf = { };
-	size_t sent = 0;
-	int fdtype = EFD_NONE;
-
-	execfd = make_execfd(&fdtype);
-	if (execfd < 0 || fdtype == EFD_NONE)
-		return -ENOTRECOVERABLE;
-
-	binfd = open("/proc/self/exe", O_RDONLY | O_CLOEXEC);
-	if (binfd < 0)
-		goto error;
-
-	if (fstat(binfd, &statbuf) < 0)
-		goto error_binfd;
-
-	while (sent < statbuf.st_size) {
-		int n = sendfile(execfd, binfd, NULL, statbuf.st_size - sent);
-		if (n < 0) {
-			/* sendfile can fail so we fallback to a dumb user-space copy. */
-			n = fd_to_fd(execfd, binfd);
-			if (n < 0)
-				goto error_binfd;
-		}
-		sent += n;
-	}
-	close(binfd);
-	if (sent != statbuf.st_size)
-		goto error;
-
-	if (seal_execfd(&execfd, fdtype) < 0)
-		goto error;
-
-	return execfd;
-
-error_binfd:
-	close(binfd);
-error:
-	close(execfd);
-	return -EIO;
-}
-
-/* Get cheap access to the environment. */
-extern char **environ;
-
-int ensure_cloned_binary(void)
-{
-	int execfd;
-	char **argv = NULL;
-
-	/* Check that we're not self-cloned, and if we are then bail. */
-	int cloned = is_self_cloned();
-	if (cloned > 0 || cloned == -ENOTRECOVERABLE)
-		return cloned;
-
-	if (fetchve(&argv) < 0)
-		return -EINVAL;
-
-	execfd = clone_binary();
-	if (execfd < 0)
-		return -EIO;
-
-	if (putenv(CLONED_BINARY_ENV "=1"))
-		goto error;
-
-	fexecve(execfd, argv, environ);
-error:
-	close(execfd);
-	return -ENOEXEC;
-}
diff --git a/libcontainer/nsenter/nsexec.c b/libcontainer/nsenter/nsexec.c
index 22b6ea1cd21..45d5c291e4f 100644
--- a/libcontainer/nsenter/nsexec.c
+++ b/libcontainer/nsenter/nsexec.c
@@ -44,10 +44,6 @@ enum sync_t {
 	SYNC_RECVPID_ACK = 0x43,	/* PID was correctly received by parent. */
 	SYNC_GRANDCHILD = 0x44,	/* The grandchild is ready to run. */
 	SYNC_CHILD_FINISH = 0x45,	/* The child or grandchild has finished. */
-	SYNC_MOUNTSOURCES_PLS = 0x46,	/* Tell parent to send mount sources by SCM_RIGHTS. */
-	SYNC_MOUNTSOURCES_ACK = 0x47,	/* All mount sources have been sent. */
-	SYNC_MOUNT_IDMAP_PLS = 0x48,	/* Tell parent to mount idmap sources. */
-	SYNC_MOUNT_IDMAP_ACK = 0x49,	/* All idmap mounts have been done. */
 };
 
 #define STAGE_SETUP  -1
@@ -96,14 +92,6 @@ struct nlconfig_t {
 	size_t uidmappath_len;
 	char *gidmappath;
 	size_t gidmappath_len;
-
-	/* Mount sources opened outside the container userns. */
-	char *mountsources;
-	size_t mountsources_len;
-
-	/* Idmap sources opened outside the container userns which will be id mapped. */
-	char *idmapsources;
-	size_t idmapsources_len;
 };
 
 /*
@@ -120,8 +108,6 @@ struct nlconfig_t {
 #define ROOTLESS_EUID_ATTR	27287
 #define UIDMAPPATH_ATTR		27288
 #define GIDMAPPATH_ATTR		27289
-#define MOUNT_SOURCES_ATTR	27290
-#define IDMAP_SOURCES_ATTR	27291
 
 /*
  * Use the raw syscall for versions of glibc which don't include a function for
@@ -437,14 +423,6 @@ static void nl_parse(int fd, struct nlconfig_t *config)
 		case SETGROUP_ATTR:
 			config->is_setgroup = readint8(current);
 			break;
-		case MOUNT_SOURCES_ATTR:
-			config->mountsources = current;
-			config->mountsources_len = payload_len;
-			break;
-		case IDMAP_SOURCES_ATTR:
-			config->idmapsources = current;
-			config->idmapsources_len = payload_len;
-			break;
 		default:
 			bail("unknown netlink message type %d", nlattr->nla_type);
 		}
@@ -525,9 +503,6 @@ void join_namespaces(char *nslist)
 	free(namespaces);
 }
 
-/* Defined in cloned_binary.c. */
-extern int ensure_cloned_binary(void);
-
 static inline int sane_kill(pid_t pid, int signum)
 {
 	if (pid > 0)
@@ -536,115 +511,6 @@ static inline int sane_kill(pid_t pid, int signum)
 		return 0;
 }
 
-/* receive_fd_sources parses env_var as an array of fd numbers and, for each element that is
- * not -1, it receives an fd via SCM_RIGHTS and dup3 it to the fd requested in
- * the element of the env var.
- */
-void receive_fd_sources(int sockfd, const char *env_var)
-{
-	char *fds, *endp;
-	long new_fd;
-
-	// This env var must be a json array of ints.
-	fds = getenv(env_var);
-
-	if (fds[0] != '[') {
-		bail("malformed %s env var: missing '['", env_var);
-	}
-	fds++;
-
-	for (endp = fds; *endp != ']'; fds = endp + 1) {
-		new_fd = strtol(fds, &endp, 10);
-		if (endp == fds) {
-			bail("malformed %s env var: not a number", env_var);
-		}
-		if (*endp == '\0') {
-			bail("malformed %s env var: missing ]", env_var);
-		}
-		// The list contains -1 when no fd is needed. Ignore them.
-		if (new_fd == -1) {
-			continue;
-		}
-
-		if (new_fd == LONG_MAX || new_fd < 0 || new_fd > INT_MAX) {
-			bail("malformed %s env var: fds out of range", env_var);
-		}
-
-		int recv_fd = receive_fd(sockfd);
-		if (dup3(recv_fd, new_fd, O_CLOEXEC) < 0) {
-			bail("cannot dup3 fd %d to %ld", recv_fd, new_fd);
-		}
-		if (close(recv_fd) < 0) {
-			bail("cannot close fd %d", recv_fd);
-		}
-	}
-}
-
-void receive_mountsources(int sockfd)
-{
-	receive_fd_sources(sockfd, "_LIBCONTAINER_MOUNT_FDS");
-}
-
-void send_mountsources(int sockfd, pid_t child, char *mountsources, size_t mountsources_len)
-{
-	char proc_path[PATH_MAX];
-	int host_mntns_fd;
-	int container_mntns_fd;
-	int fd;
-	int ret;
-
-	// container_linux.go shouldSendMountSources() decides if mount sources
-	// should be pre-opened (O_PATH) and passed via SCM_RIGHTS
-	if (mountsources == NULL)
-		return;
-
-	host_mntns_fd = open("/proc/self/ns/mnt", O_RDONLY | O_CLOEXEC);
-	if (host_mntns_fd == -1)
-		bail("failed to get current mount namespace");
-
-	if (snprintf(proc_path, PATH_MAX, "/proc/%d/ns/mnt", child) < 0)
-		bail("failed to get mount namespace path");
-
-	container_mntns_fd = open(proc_path, O_RDONLY | O_CLOEXEC);
-	if (container_mntns_fd == -1)
-		bail("failed to get container mount namespace");
-
-	if (setns(container_mntns_fd, CLONE_NEWNS) < 0)
-		bail("failed to setns to container mntns");
-
-	char *mountsources_end = mountsources + mountsources_len;
-	while (mountsources < mountsources_end) {
-		if (mountsources[0] == '\0') {
-			mountsources++;
-			continue;
-		}
-
-		fd = open(mountsources, O_PATH | O_CLOEXEC);
-		if (fd < 0)
-			bail("failed to open mount source %s", mountsources);
-
-		write_log(DEBUG, "~> sending fd for: %s", mountsources);
-		if (send_fd(sockfd, fd) < 0)
-			bail("failed to send fd %d via unix socket %d", fd, sockfd);
-
-		ret = close(fd);
-		if (ret != 0)
-			bail("failed to close mount source fd %d", fd);
-
-		mountsources += strlen(mountsources) + 1;
-	}
-
-	if (setns(host_mntns_fd, CLONE_NEWNS) < 0)
-		bail("failed to setns to host mntns");
-
-	ret = close(host_mntns_fd);
-	if (ret != 0)
-		bail("failed to close host mount namespace fd %d", host_mntns_fd);
-	ret = close(container_mntns_fd);
-	if (ret != 0)
-		bail("failed to close container mount namespace fd %d", container_mntns_fd);
-}
-
 void try_unshare(int flags, const char *msg)
 {
 	write_log(DEBUG, "unshare %s", msg);
@@ -664,89 +530,6 @@ void try_unshare(int flags, const char *msg)
 	bail("failed to unshare %s", msg);
 }
 
-void send_idmapsources(int sockfd, pid_t pid, char *idmap_src, int idmap_src_len)
-{
-	char proc_user_path[PATH_MAX];
-
-	/* Open the userns fd only once.
-	 * Currently we only support idmap mounts that use the same mapping than
-	 * the userns. This is validated in libcontainer/configs/validate/validator.go,
-	 * so if we reached here, we know the mapping for the idmap is the same
-	 * as the userns. This is why we just open the userns_fd once from the
-	 * PID of the child process that has the userns already applied.
-	 */
-	int ret = snprintf(proc_user_path, sizeof(proc_user_path), "/proc/%d/ns/user", pid);
-	if (ret < 0 || (size_t)ret >= sizeof(proc_user_path)) {
-		sane_kill(pid, SIGKILL);
-		bail("failed to create userns path string");
-	}
-
-	int userns_fd = open(proc_user_path, O_RDONLY | O_CLOEXEC | O_NOCTTY);
-	if (userns_fd < 0) {
-		sane_kill(pid, SIGKILL);
-		bail("failed to get user namespace fd");
-	}
-
-	char *idmap_end = idmap_src + idmap_src_len;
-	while (idmap_src < idmap_end) {
-		if (idmap_src[0] == '\0') {
-			idmap_src++;
-			continue;
-		}
-
-		int fd_tree = sys_open_tree(-EBADF, idmap_src,
-					    OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC |
-					    AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT);
-		if (fd_tree < 0) {
-			sane_kill(pid, SIGKILL);
-			if (errno == ENOSYS) {
-				bail("open_tree(2) failed, the kernel doesn't support ID-mapped mounts");
-			} else if (errno == EINVAL) {
-				bail("open_tree(2) failed with path: %s, the kernel doesn't support ID-mapped mounts",
-				     idmap_src);
-			} else {
-				bail("open_tree(2) failed with path: %s", idmap_src);
-			}
-		}
-
-		struct mount_attr attr = {
-			.attr_set = MOUNT_ATTR_IDMAP,
-			.userns_fd = userns_fd,
-		};
-
-		ret = sys_mount_setattr(fd_tree, "", AT_EMPTY_PATH, &attr, sizeof(attr));
-		if (ret < 0) {
-			sane_kill(pid, SIGKILL);
-			if (errno == ENOSYS)
-				bail("mount_setattr(2) failed, the kernel doesn't support ID-mapped mounts");
-			else if (errno == EINVAL)
-				bail("mount_setattr(2) failed with path: %s, maybe the filesystem doesn't support ID-mapped mounts", idmap_src);
-			else
-				bail("mount_setattr(2) failed with path: %s", idmap_src);
-		}
-
-		write_log(DEBUG, "~> sending idmap source: %s with mapping from: %s", idmap_src, proc_user_path);
-		send_fd(sockfd, fd_tree);
-
-		if (close(fd_tree) < 0) {
-			sane_kill(pid, SIGKILL);
-			bail("error closing fd_tree");
-		}
-
-		idmap_src += strlen(idmap_src) + 1;
-	}
-
-	if (close(userns_fd) < 0) {
-		sane_kill(pid, SIGKILL);
-		bail("error closing userns fd");
-	}
-}
-
-void receive_idmapsources(int sockfd)
-{
-	receive_fd_sources(sockfd, "_LIBCONTAINER_IDMAP_FDS");
-}
-
 void nsexec(void)
 {
 	int pipenum;
@@ -771,14 +554,6 @@ void nsexec(void)
 		return;
 	}
 
-	/*
-	 * We need to re-exec if we are not in a cloned binary. This is necessary
-	 * to ensure that containers won't be able to access the host binary
-	 * through /proc/self/exe. See CVE-2019-5736.
-	 */
-	if (ensure_cloned_binary() < 0)
-		bail("could not ensure we are a cloned binary");
-
 	/*
 	 * Inform the parent we're past initial setup.
 	 * For the other side of this, see initWaiter.
@@ -977,28 +752,6 @@ void nsexec(void)
 						sane_kill(stage2_pid, SIGKILL);
 						bail("failed to sync with runc: write(pid-JSON)");
 					}
-					break;
-				case SYNC_MOUNTSOURCES_PLS:
-					write_log(DEBUG, "stage-1 requested to open mount sources");
-					send_mountsources(syncfd, stage1_pid, config.mountsources,
-							  config.mountsources_len);
-
-					s = SYNC_MOUNTSOURCES_ACK;
-					if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
-						sane_kill(stage1_pid, SIGKILL);
-						bail("failed to sync with child: write(SYNC_MOUNTSOURCES_ACK)");
-					}
-					break;
-				case SYNC_MOUNT_IDMAP_PLS:
-					write_log(DEBUG, "stage-1 requested to open idmap sources");
-					send_idmapsources(syncfd, stage1_pid, config.idmapsources,
-							  config.idmapsources_len);
-					s = SYNC_MOUNT_IDMAP_ACK;
-					if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
-						sane_kill(stage1_pid, SIGKILL);
-						bail("failed to sync with child: write(SYNC_MOUNT_IDMAP_ACK)");
-					}
-
 					break;
 				case SYNC_CHILD_FINISH:
 					write_log(DEBUG, "stage-1 complete");
@@ -1151,39 +904,7 @@ void nsexec(void)
 			 * some old kernel versions where clone(CLONE_PARENT | CLONE_NEWPID)
 			 * was broken, so we'll just do it the long way anyway.
 			 */
-			try_unshare(config.cloneflags & ~CLONE_NEWCGROUP, "remaining namespaces (except cgroupns)");
-
-			/* Ask our parent to send the mount sources fds. */
-			if (config.mountsources) {
-				write_log(DEBUG, "request stage-0 to send mount sources");
-				s = SYNC_MOUNTSOURCES_PLS;
-				if (write(syncfd, &s, sizeof(s)) != sizeof(s))
-					bail("failed to sync with parent: write(SYNC_MOUNTSOURCES_PLS)");
-
-				/* Receive and install all mount sources fds. */
-				receive_mountsources(syncfd);
-
-				/* Parent finished to send the mount sources fds. */
-				if (read(syncfd, &s, sizeof(s)) != sizeof(s))
-					bail("failed to sync with parent: read(SYNC_MOUNTSOURCES_ACK)");
-				if (s != SYNC_MOUNTSOURCES_ACK)
-					bail("failed to sync with parent: SYNC_MOUNTSOURCES_ACK: got %u", s);
-			}
-
-			if (config.idmapsources) {
-				write_log(DEBUG, "request stage-0 to send idmap sources");
-				s = SYNC_MOUNT_IDMAP_PLS;
-				if (write(syncfd, &s, sizeof(s)) != sizeof(s))
-					bail("failed to sync with parent: write(SYNC_MOUNT_IDMAP_PLS)");
-
-				/* Receive and install all idmap fds. */
-				receive_idmapsources(syncfd);
-
-				if (read(syncfd, &s, sizeof(s)) != sizeof(s))
-					bail("failed to sync with parent: read(SYNC_MOUNT_IDMAP_ACK)");
-				if (s != SYNC_MOUNT_IDMAP_ACK)
-					bail("failed to sync with parent: SYNC_MOUNT_IDMAP_ACK: got %u", s);
-			}
+			try_unshare(config.cloneflags, "remaining namespaces");
 
 			/*
 			 * TODO: What about non-namespace clone flags that we're dropping here?
@@ -1281,10 +1002,6 @@ void nsexec(void)
 					bail("setgroups failed");
 			}
 
-			if (config.cloneflags & CLONE_NEWCGROUP) {
-				try_unshare(CLONE_NEWCGROUP, "cgroup namespace");
-			}
-
 			write_log(DEBUG, "signal completion to stage-0");
 			s = SYNC_CHILD_FINISH;
 			if (write(syncfd, &s, sizeof(s)) != sizeof(s))
diff --git a/libcontainer/process_linux.go b/libcontainer/process_linux.go
index 48861406dba..fb5cdc432b6 100644
--- a/libcontainer/process_linux.go
+++ b/libcontainer/process_linux.go
@@ -1,6 +1,7 @@
 package libcontainer
 
 import (
+	"context"
 	"encoding/json"
 	"errors"
 	"fmt"
@@ -9,19 +10,22 @@ import (
 	"os"
 	"os/exec"
 	"path/filepath"
+	"runtime"
 	"strconv"
 	"time"
 
+	"github.com/opencontainers/runtime-spec/specs-go"
+	"github.com/sirupsen/logrus"
+	"golang.org/x/sys/unix"
+
 	"github.com/opencontainers/runc/libcontainer/cgroups"
 	"github.com/opencontainers/runc/libcontainer/cgroups/fs2"
 	"github.com/opencontainers/runc/libcontainer/configs"
 	"github.com/opencontainers/runc/libcontainer/intelrdt"
 	"github.com/opencontainers/runc/libcontainer/logs"
 	"github.com/opencontainers/runc/libcontainer/system"
+	"github.com/opencontainers/runc/libcontainer/userns"
 	"github.com/opencontainers/runc/libcontainer/utils"
-	"github.com/opencontainers/runtime-spec/specs-go"
-	"github.com/sirupsen/logrus"
-	"golang.org/x/sys/unix"
 )
 
 type parentProcess interface {
@@ -167,18 +171,34 @@ func (p *setnsProcess) start() (retErr error) {
 			// This shouldn't happen.
 			panic("unexpected procReady in setns")
 		case procHooks:
-			// This shouldn't happen.
+			// this shouldn't happen.
 			panic("unexpected procHooks in setns")
+		case procMountPlease:
+			// this shouldn't happen.
+			panic("unexpected procMountPlease in setns")
 		case procSeccomp:
 			if p.config.Config.Seccomp.ListenerPath == "" {
-				return errors.New("listenerPath is not set")
+				return errors.New("seccomp listenerPath is not set")
 			}
-
-			seccompFd, err := recvSeccompFd(uintptr(p.pid()), uintptr(sync.Fd))
+			if sync.Arg == nil {
+				return fmt.Errorf("sync %q is missing an argument", sync.Type)
+			}
+			var srcFd int
+			if err := json.Unmarshal(*sync.Arg, &srcFd); err != nil {
+				return fmt.Errorf("sync %q passed invalid fd arg: %w", sync.Type, err)
+			}
+			seccompFd, err := pidGetFd(p.pid(), srcFd)
 			if err != nil {
+				return fmt.Errorf("sync %q get fd %d from child failed: %w", sync.Type, srcFd, err)
+			}
+			defer seccompFd.Close()
+			// We have a copy, the child can keep working. We don't need to
+			// wait for the seccomp notify listener to get the fd before we
+			// permit the child to continue because the child will happily wait
+			// for the listener if it hits SCMP_ACT_NOTIFY.
+			if err := writeSync(p.messageSockPair.parent, procSeccompDone); err != nil {
 				return err
 			}
-			defer unix.Close(seccompFd)
 
 			bundle, annotations := utils.Annotations(p.config.Config.Labels)
 			containerProcessState := &specs.ContainerProcessState{
@@ -199,15 +219,10 @@ func (p *setnsProcess) start() (retErr error) {
 				containerProcessState, seccompFd); err != nil {
 				return err
 			}
-
-			// Sync with child.
-			if err := writeSync(p.messageSockPair.parent, procSeccompDone); err != nil {
-				return err
-			}
-			return nil
 		default:
 			return errors.New("invalid JSON payload from child")
 		}
+		return nil
 	})
 
 	if err := unix.Shutdown(int(p.messageSockPair.parent.Fd()), unix.SHUT_WR); err != nil {
@@ -352,6 +367,95 @@ func (p *initProcess) waitForChildExit(childPid int) error {
 	return nil
 }
 
+type mountSourceRequestFn func(*configs.Mount) (mountSource, error)
+
+// goCreateMountSources spawns a goroutine which creates open_tree(2)-style
+// mountfds based on the requested configs.Mount configuration. The returned
+// requestFn and cancelFn are used to interact with the goroutine.
+func (p *initProcess) goCreateMountSources(ctx context.Context) (mountSourceRequestFn, context.CancelFunc, error) {
+	type response struct {
+		src mountSource
+		err error
+	}
+
+	errCh := make(chan error, 1)
+	requestCh := make(chan *configs.Mount)
+	responseCh := make(chan response)
+
+	ctx, cancelFn := context.WithCancel(ctx)
+	go func() {
+		// We lock this thread because we need to setns(2) here. There is no
+		// UnlockOSThread() here, to ensure that the Go runtime will kill this
+		// thread once this goroutine returns (ensuring no other goroutines run
+		// in this context).
+		runtime.LockOSThread()
+
+		// Detach from the shared fs of the rest of the Go process.
+		if err := unix.Unshare(unix.CLONE_FS); err != nil {
+			err = os.NewSyscallError("unshare(CLONE_FS)", err)
+			errCh <- fmt.Errorf("mount source thread: %w", err)
+		}
+
+		// Attach to the container's mount namespace.
+		nsFd, err := os.Open(fmt.Sprintf("/proc/%d/ns/mnt", p.pid()))
+		if err != nil {
+			errCh <- fmt.Errorf("mount source thread: open container mntns: %w", err)
+			return
+		}
+		defer nsFd.Close()
+		if err := unix.Setns(int(nsFd.Fd()), unix.CLONE_NEWNS); err != nil {
+			err = os.NewSyscallError("setns", err)
+			errCh <- fmt.Errorf("mount source thread: join container mntns: %w", err)
+			return
+		}
+
+		// No errors during setup!
+		errCh <- nil
+		logrus.Debugf("mount source thread: successfully running in container mntns")
+
+		nsHandles := new(userns.Handles)
+		defer nsHandles.Release()
+		for {
+			select {
+			case m := <-requestCh:
+				src, err := mountFd(nsHandles, m)
+				logrus.Debugf("mount source thread: handling request for %q: %v %v", m.Source, src, err)
+				responseCh <- response{
+					src: src,
+					err: err,
+				}
+			case <-ctx.Done():
+				close(requestCh)
+				close(responseCh)
+				return
+			}
+		}
+	}()
+
+	// Check for setup errors.
+	err := <-errCh
+	close(errCh)
+	if err != nil {
+		cancelFn()
+		return nil, nil, err
+	}
+
+	requestFn := func(m *configs.Mount) (mountSource, error) {
+		select {
+		case requestCh <- m:
+			select {
+			case resp := <-responseCh:
+				return resp.src, resp.err
+			case <-ctx.Done():
+				return mountSource{}, errors.New("receive mount source failed: channel closed")
+			}
+		case <-ctx.Done():
+			return mountSource{}, errors.New("send mount source request failed: channel closed")
+		}
+	}
+	return requestFn, cancelFn, nil
+}
+
 func (p *initProcess) start() (retErr error) {
 	defer p.messageSockPair.parent.Close() //nolint: errcheck
 	err := p.cmd.Start()
@@ -442,6 +546,17 @@ func (p *initProcess) start() (retErr error) {
 		return fmt.Errorf("error waiting for our first child to exit: %w", err)
 	}
 
+	// Spin up a goroutine to handle remapping mount requests by runc init. There is no point doing this for rootless containers becuase
+	var mountRequest mountSourceRequestFn
+	if !p.container.config.RootlessEUID {
+		request, cancel, err := p.goCreateMountSources(context.Background())
+		if err != nil {
+			return fmt.Errorf("error spawning mount remapping thread: %w", err)
+		}
+		defer cancel()
+		mountRequest = request
+	}
+
 	if err := p.createNetworkInterfaces(); err != nil {
 		return fmt.Errorf("error creating network interfaces: %w", err)
 	}
@@ -458,16 +573,58 @@ func (p *initProcess) start() (retErr error) {
 
 	ierr := parseSync(p.messageSockPair.parent, func(sync *syncT) error {
 		switch sync.Type {
+		case procMountPlease:
+			if mountRequest == nil {
+				return fmt.Errorf("cannot fulfil mount requests as a rootless user")
+			}
+			var m *configs.Mount
+			if sync.Arg == nil {
+				return fmt.Errorf("sync %q is missing an argument", sync.Type)
+			}
+			if err := json.Unmarshal(*sync.Arg, &m); err != nil {
+				return fmt.Errorf("sync %q passed invalid mount arg: %w", sync.Type, err)
+			}
+			mnt, err := mountRequest(m)
+			if err != nil {
+				return fmt.Errorf("failed to fulfil mount request: %w", err)
+			}
+			defer mnt.file.Close()
+
+			arg, err := json.Marshal(mnt)
+			if err != nil {
+				return fmt.Errorf("sync %q failed to marshal mountSource: %w", sync.Type, err)
+			}
+			argMsg := json.RawMessage(arg)
+			if err := doWriteSync(p.messageSockPair.parent, syncT{
+				Type: procMountFd,
+				Arg:  &argMsg,
+				File: mnt.file,
+			}); err != nil {
+				return err
+			}
 		case procSeccomp:
 			if p.config.Config.Seccomp.ListenerPath == "" {
-				return errors.New("listenerPath is not set")
+				return errors.New("seccomp listenerPath is not set")
 			}
-
-			seccompFd, err := recvSeccompFd(uintptr(childPid), uintptr(sync.Fd))
+			var srcFd int
+			if sync.Arg == nil {
+				return fmt.Errorf("sync %q is missing an argument", sync.Type)
+			}
+			if err := json.Unmarshal(*sync.Arg, &srcFd); err != nil {
+				return fmt.Errorf("sync %q passed invalid fd arg: %w", sync.Type, err)
+			}
+			seccompFd, err := pidGetFd(p.pid(), srcFd)
 			if err != nil {
+				return fmt.Errorf("sync %q get fd %d from child failed: %w", sync.Type, srcFd, err)
+			}
+			defer seccompFd.Close()
+			// We have a copy, the child can keep working. We don't need to
+			// wait for the seccomp notify listener to get the fd before we
+			// permit the child to continue because the child will happily wait
+			// for the listener if it hits SCMP_ACT_NOTIFY.
+			if err := writeSync(p.messageSockPair.parent, procSeccompDone); err != nil {
 				return err
 			}
-			defer unix.Close(seccompFd)
 
 			s, err := p.container.currentOCIState()
 			if err != nil {
@@ -488,11 +645,6 @@ func (p *initProcess) start() (retErr error) {
 				containerProcessState, seccompFd); err != nil {
 				return err
 			}
-
-			// Sync with child.
-			if err := writeSync(p.messageSockPair.parent, procSeccompDone); err != nil {
-				return err
-			}
 		case procReady:
 			// set rlimits, this has to be done here because we lose permissions
 			// to raise the limits once we enter a user-namespace
@@ -591,7 +743,6 @@ func (p *initProcess) start() (retErr error) {
 		default:
 			return errors.New("invalid JSON payload from child")
 		}
-
 		return nil
 	})
 
@@ -684,22 +835,20 @@ func (p *initProcess) forwardChildLogs() chan error {
 	return logs.ForwardLogs(p.logFilePair.parent)
 }
 
-func recvSeccompFd(childPid, childFd uintptr) (int, error) {
-	pidfd, _, errno := unix.Syscall(unix.SYS_PIDFD_OPEN, childPid, 0, 0)
-	if errno != 0 {
-		return -1, fmt.Errorf("performing SYS_PIDFD_OPEN syscall: %w", errno)
+func pidGetFd(pid, srcFd int) (*os.File, error) {
+	pidFd, err := unix.PidfdOpen(pid, 0)
+	if err != nil {
+		return nil, os.NewSyscallError("pidfd_open", err)
 	}
-	defer unix.Close(int(pidfd))
-
-	seccompFd, _, errno := unix.Syscall(unix.SYS_PIDFD_GETFD, pidfd, childFd, 0)
-	if errno != 0 {
-		return -1, fmt.Errorf("performing SYS_PIDFD_GETFD syscall: %w", errno)
+	defer unix.Close(pidFd)
+	fd, err := unix.PidfdGetfd(pidFd, srcFd, 0)
+	if err != nil {
+		return nil, os.NewSyscallError("pidfd_getfd", err)
 	}
-
-	return int(seccompFd), nil
+	return os.NewFile(uintptr(fd), "[pidfd_getfd]"), nil
 }
 
-func sendContainerProcessState(listenerPath string, state *specs.ContainerProcessState, fd int) error {
+func sendContainerProcessState(listenerPath string, state *specs.ContainerProcessState, file *os.File) error {
 	conn, err := net.Dial("unix", listenerPath)
 	if err != nil {
 		return fmt.Errorf("failed to connect with seccomp agent specified in the seccomp profile: %w", err)
@@ -716,11 +865,10 @@ func sendContainerProcessState(listenerPath string, state *specs.ContainerProces
 		return fmt.Errorf("cannot marshall seccomp state: %w", err)
 	}
 
-	err = utils.SendFds(socket, b, fd)
-	if err != nil {
+	if err := utils.SendRawFd(socket, string(b), file.Fd()); err != nil {
 		return fmt.Errorf("cannot send seccomp fd to %s: %w", listenerPath, err)
 	}
-
+	runtime.KeepAlive(file)
 	return nil
 }
 
diff --git a/libcontainer/rootfs_linux.go b/libcontainer/rootfs_linux.go
index ac3ba183e12..fb7e917832c 100644
--- a/libcontainer/rootfs_linux.go
+++ b/libcontainer/rootfs_linux.go
@@ -1,9 +1,9 @@
 package libcontainer
 
 import (
+	"encoding/json"
 	"errors"
 	"fmt"
-	"io"
 	"os"
 	"path"
 	"path/filepath"
@@ -14,16 +14,17 @@ import (
 	securejoin "github.com/cyphar/filepath-securejoin"
 	"github.com/moby/sys/mountinfo"
 	"github.com/mrunalp/fileutils"
+	"github.com/opencontainers/runtime-spec/specs-go"
+	"github.com/opencontainers/selinux/go-selinux/label"
+	"github.com/sirupsen/logrus"
+	"golang.org/x/sys/unix"
+
 	"github.com/opencontainers/runc/libcontainer/cgroups"
 	"github.com/opencontainers/runc/libcontainer/cgroups/fs2"
 	"github.com/opencontainers/runc/libcontainer/configs"
 	"github.com/opencontainers/runc/libcontainer/devices"
 	"github.com/opencontainers/runc/libcontainer/userns"
 	"github.com/opencontainers/runc/libcontainer/utils"
-	"github.com/opencontainers/runtime-spec/specs-go"
-	"github.com/opencontainers/selinux/go-selinux/label"
-	"github.com/sirupsen/logrus"
-	"golang.org/x/sys/unix"
 )
 
 const defaultMountFlags = unix.MS_NOEXEC | unix.MS_NOSUID | unix.MS_NODEV
@@ -41,12 +42,12 @@ type mountConfig struct {
 // mountEntry contains mount data specific to a mount point.
 type mountEntry struct {
 	*configs.Mount
-	srcFD *int
+	srcFile *mountSource
 }
 
 func (m *mountEntry) src() string {
-	if m.srcFD != nil {
-		return "/proc/self/fd/" + strconv.Itoa(*m.srcFD)
+	if m.srcFile != nil {
+		return "/proc/self/fd/" + strconv.Itoa(int(m.srcFile.file.Fd()))
 	}
 	return m.Source
 }
@@ -64,20 +65,12 @@ func needsSetupDev(config *configs.Config) bool {
 // prepareRootfs sets up the devices, mount points, and filesystems for use
 // inside a new mount namespace. It doesn't set anything as ro. You must call
 // finalizeRootfs after this function to finish setting up the rootfs.
-func prepareRootfs(pipe io.ReadWriter, iConfig *initConfig, mountFds mountFds) (err error) {
+func prepareRootfs(pipe *os.File, iConfig *initConfig) (err error) {
 	config := iConfig.Config
 	if err := prepareRoot(config); err != nil {
 		return fmt.Errorf("error preparing rootfs: %w", err)
 	}
 
-	if mountFds.sourceFds != nil && len(mountFds.sourceFds) != len(config.Mounts) {
-		return fmt.Errorf("malformed mountFds slice. Expected size: %v, got: %v", len(config.Mounts), len(mountFds.sourceFds))
-	}
-
-	if mountFds.idmapFds != nil && len(mountFds.idmapFds) != len(config.Mounts) {
-		return fmt.Errorf("malformed idmapFds slice: expected size: %v, got: %v", len(config.Mounts), len(mountFds.idmapFds))
-	}
-
 	mountConfig := &mountConfig{
 		root:            config.Rootfs,
 		label:           config.MountLabel,
@@ -86,22 +79,48 @@ func prepareRootfs(pipe io.ReadWriter, iConfig *initConfig, mountFds mountFds) (
 		cgroupns:        config.Namespaces.Contains(configs.NEWCGROUP),
 		noMountFallback: config.NoMountFallback,
 	}
-	for i, m := range config.Mounts {
+	for _, m := range config.Mounts {
 		entry := mountEntry{Mount: m}
-		// Just before the loop we checked that if not empty, len(mountFds.sourceFds) == len(config.Mounts).
-		// Therefore, we can access mountFds.sourceFds[i] without any concerns.
-		if mountFds.sourceFds != nil && mountFds.sourceFds[i] != -1 {
-			entry.srcFD = &mountFds.sourceFds[i]
+		// Figure out whether we need to request runc to give us an
+		// open_tree(2)-style mountfd. For idmapped mounts, this is always
+		// necessary. For bind-mounts, this is only necessary if we cannot
+		// resolve the parent mount (this is only hit if you are running in a
+		// userns).
+		wantSourceFile := m.IsIDMapped()
+		if m.IsBind() {
+			if _, err := os.Stat(m.Source); err != nil {
+				wantSourceFile = true
+			}
 		}
-
-		// We validated before we can access mountFds.idmapFds[i].
-		if mountFds.idmapFds != nil && mountFds.idmapFds[i] != -1 {
-			if entry.srcFD != nil {
-				return fmt.Errorf("malformed mountFds and idmapFds slice, entry: %v has fds in both slices", i)
+		if wantSourceFile {
+			// Request a source file from the host.
+			if err := writeSyncArg(pipe, procMountPlease, m); err != nil {
+				return fmt.Errorf("failed to request mountfd for %q: %w", m.Source, err)
+			}
+			sync, err := readSyncFull(pipe, procMountFd)
+			if err != nil {
+				return fmt.Errorf("mountfd request for %q failed: %w", m.Source, err)
+			}
+			if sync.File == nil {
+				return fmt.Errorf("mountfd request for %q: response missing attached fd", m.Source)
+			}
+			defer sync.File.Close()
+			// Sanity-check to make sure we didn't get the wrong fd back. This
+			// should never happen.
+			if sync.File.Name() != m.Source {
+				return fmt.Errorf("returned mountfd for %q doesn't match requested mount configuration: mountfd path is %q", m.Source, sync.File.Name())
+			}
+			// Unmarshal the procMountFd argument (the file is sync.File).
+			var src mountSource
+			if sync.Arg == nil {
+				return fmt.Errorf("sync %q is missing an argument", sync.Type)
+			}
+			if err := json.Unmarshal(*sync.Arg, &src); err != nil {
+				return fmt.Errorf("invalid mount fd response argument %q: %w", string(*sync.Arg), err)
 			}
-			entry.srcFD = &mountFds.idmapFds[i]
+			src.file = sync.File
+			entry.srcFile = &src
 		}
-
 		if err := mountToRootfs(mountConfig, entry); err != nil {
 			return fmt.Errorf("error mounting %q to rootfs at %q: %w", m.Source, m.Destination, err)
 		}
@@ -284,7 +303,7 @@ func mountCgroupV1(m *configs.Mount, c *mountConfig) error {
 			if err := os.MkdirAll(subsystemPath, 0o755); err != nil {
 				return err
 			}
-			if err := utils.WithProcfd(c.root, b.Destination, func(dstFD string) error {
+			if err := utils.WithProcfd(c.root, b.Destination, func(dstFd string) error {
 				flags := defaultMountFlags
 				if m.Flags&unix.MS_RDONLY != 0 {
 					flags = flags | unix.MS_RDONLY
@@ -297,7 +316,7 @@ func mountCgroupV1(m *configs.Mount, c *mountConfig) error {
 					data = cgroups.CgroupNamePrefix + data
 					source = "systemd"
 				}
-				return mountViaFDs(source, nil, b.Destination, dstFD, "cgroup", uintptr(flags), data)
+				return mountViaFds(source, nil, b.Destination, dstFd, "cgroup", uintptr(flags), data)
 			}); err != nil {
 				return err
 			}
@@ -328,8 +347,8 @@ func mountCgroupV2(m *configs.Mount, c *mountConfig) error {
 	if err := os.MkdirAll(dest, 0o755); err != nil {
 		return err
 	}
-	err = utils.WithProcfd(c.root, m.Destination, func(dstFD string) error {
-		return mountViaFDs(m.Source, nil, m.Destination, dstFD, "cgroup2", uintptr(m.Flags), m.Data)
+	err = utils.WithProcfd(c.root, m.Destination, func(dstFd string) error {
+		return mountViaFds(m.Source, nil, m.Destination, dstFd, "cgroup2", uintptr(m.Flags), m.Data)
 	})
 	if err == nil || !(errors.Is(err, unix.EPERM) || errors.Is(err, unix.EBUSY)) {
 		return err
@@ -350,7 +369,6 @@ func mountCgroupV2(m *configs.Mount, c *mountConfig) error {
 		bindM.Source = c.cgroup2Path
 	}
 	// mountToRootfs() handles remounting for MS_RDONLY.
-	// No need to set mountEntry.srcFD here, because mountToRootfs() calls utils.WithProcfd() by itself in mountPropagate().
 	err = mountToRootfs(c, mountEntry{Mount: bindM})
 	if c.rootlessCgroups && errors.Is(err, unix.ENOENT) {
 		// ENOENT (for `src = c.cgroup2Path`) happens when rootless runc is being executed
@@ -395,15 +413,15 @@ func doTmpfsCopyUp(m mountEntry, rootfs, mountLabel string) (Err error) {
 		}
 	}()
 
-	return utils.WithProcfd(rootfs, m.Destination, func(dstFD string) (Err error) {
+	return utils.WithProcfd(rootfs, m.Destination, func(dstFd string) (Err error) {
 		// Copy the container data to the host tmpdir. We append "/" to force
 		// CopyDirectory to resolve the symlink rather than trying to copy the
 		// symlink itself.
-		if err := fileutils.CopyDirectory(dstFD+"/", tmpDir); err != nil {
-			return fmt.Errorf("tmpcopyup: failed to copy %s to %s (%s): %w", m.Destination, dstFD, tmpDir, err)
+		if err := fileutils.CopyDirectory(dstFd+"/", tmpDir); err != nil {
+			return fmt.Errorf("tmpcopyup: failed to copy %s to %s (%s): %w", m.Destination, dstFd, tmpDir, err)
 		}
 		// Now move the mount into the container.
-		if err := mountViaFDs(tmpDir, nil, m.Destination, dstFD, "", unix.MS_MOVE, ""); err != nil {
+		if err := mountViaFds(tmpDir, nil, m.Destination, dstFd, "", unix.MS_MOVE, ""); err != nil {
 			return fmt.Errorf("tmpcopyup: failed to move mount: %w", err)
 		}
 		return nil
@@ -480,35 +498,9 @@ func mountToRootfs(c *mountConfig, m mountEntry) error {
 		if err := prepareBindMount(m, rootfs); err != nil {
 			return err
 		}
-
-		if m.IsBind() && m.IsIDMapped() {
-			if m.srcFD == nil {
-				return fmt.Errorf("error creating mount %+v: idmapFD is invalid, should point to a valid fd", m)
-			}
-			if err := unix.MoveMount(*m.srcFD, "", -1, dest, unix.MOVE_MOUNT_F_EMPTY_PATH); err != nil {
-				return fmt.Errorf("error on unix.MoveMount %+v: %w", m, err)
-			}
-
-			// In nsexec.c, we did not set the propagation field of mount_attr struct.
-			// So, let's deal with these flags right now!
-			if err := utils.WithProcfd(rootfs, dest, func(dstFD string) error {
-				for _, pflag := range m.PropagationFlags {
-					// When using mount for setting propagations flags, the source, file
-					// system type and data arguments are ignored:
-					// https://man7.org/linux/man-pages/man2/mount.2.html
-					// We also ignore procfd because we want to act on dest.
-					if err := mountViaFDs("", nil, dest, dstFD, "", uintptr(pflag), ""); err != nil {
-						return err
-					}
-				}
-				return nil
-			}); err != nil {
-				return fmt.Errorf("change mount propagation through procfd: %w", err)
-			}
-		} else {
-			if err := mountPropagate(m, rootfs, mountLabel); err != nil {
-				return err
-			}
+		// open_tree()-related shenanigans are all handled in mountViaFds.
+		if err := mountPropagate(m, rootfs, mountLabel); err != nil {
+			return err
 		}
 		// bind mount won't change mount options, we need remount to make mount options effective.
 		// first check that we have non-default options required before attempting a remount
@@ -732,8 +724,8 @@ func bindMountDeviceNode(rootfs, dest string, node *devices.Device) error {
 	if f != nil {
 		_ = f.Close()
 	}
-	return utils.WithProcfd(rootfs, dest, func(dstFD string) error {
-		return mountViaFDs(node.Path, nil, dest, dstFD, "bind", unix.MS_BIND, "")
+	return utils.WithProcfd(rootfs, dest, func(dstFd string) error {
+		return mountViaFds(node.Path, nil, dest, dstFd, "bind", unix.MS_BIND, "")
 	})
 }
 
@@ -1104,9 +1096,9 @@ func writeSystemProperty(key, value string) error {
 }
 
 func remount(m mountEntry, rootfs string, noMountFallback bool) error {
-	return utils.WithProcfd(rootfs, m.Destination, func(dstFD string) error {
+	return utils.WithProcfd(rootfs, m.Destination, func(dstFd string) error {
 		flags := uintptr(m.Flags | unix.MS_REMOUNT)
-		err := mountViaFDs(m.Source, m.srcFD, m.Destination, dstFD, m.Device, flags, "")
+		err := mountViaFds("", nil, m.Destination, dstFd, m.Device, flags, "")
 		if err == nil {
 			return nil
 		}
@@ -1130,7 +1122,7 @@ func remount(m mountEntry, rootfs string, noMountFallback bool) error {
 		}
 		// ... and retry the mount with flags found above.
 		flags |= uintptr(int(s.Flags) & checkflags)
-		return mountViaFDs(m.Source, m.srcFD, m.Destination, dstFD, m.Device, flags, "")
+		return mountViaFds("", nil, m.Destination, dstFd, m.Device, flags, "")
 	})
 }
 
@@ -1153,17 +1145,17 @@ func mountPropagate(m mountEntry, rootfs string, mountLabel string) error {
 	// mutating underneath us, we verify that we are actually going to mount
 	// inside the container with WithProcfd() -- mounting through a procfd
 	// mounts on the target.
-	if err := utils.WithProcfd(rootfs, m.Destination, func(dstFD string) error {
-		return mountViaFDs(m.Source, m.srcFD, m.Destination, dstFD, m.Device, uintptr(flags), data)
+	if err := utils.WithProcfd(rootfs, m.Destination, func(dstFd string) error {
+		return mountViaFds(m.Source, m.srcFile, m.Destination, dstFd, m.Device, uintptr(flags), data)
 	}); err != nil {
 		return err
 	}
 	// We have to apply mount propagation flags in a separate WithProcfd() call
 	// because the previous call invalidates the passed procfd -- the mount
 	// target needs to be re-opened.
-	if err := utils.WithProcfd(rootfs, m.Destination, func(dstFD string) error {
+	if err := utils.WithProcfd(rootfs, m.Destination, func(dstFd string) error {
 		for _, pflag := range m.PropagationFlags {
-			if err := mountViaFDs("", nil, m.Destination, dstFD, "", uintptr(pflag), ""); err != nil {
+			if err := mountViaFds("", nil, m.Destination, dstFd, "", uintptr(pflag), ""); err != nil {
 				return err
 			}
 		}
diff --git a/libcontainer/seccomp/patchbpf/enosys_linux.go b/libcontainer/seccomp/patchbpf/enosys_linux.go
index 7fc9fd662c3..40e51533410 100644
--- a/libcontainer/seccomp/patchbpf/enosys_linux.go
+++ b/libcontainer/seccomp/patchbpf/enosys_linux.go
@@ -690,17 +690,17 @@ func sysSeccompSetFilter(flags uint, filter []unix.SockFilter) (fd int, err erro
 // patches said filter to handle -ENOSYS in a much nicer manner than the
 // default libseccomp default action behaviour, and loads the patched filter
 // into the kernel for the current process.
-func PatchAndLoad(config *configs.Seccomp, filter *libseccomp.ScmpFilter) (int, error) {
+func PatchAndLoad(config *configs.Seccomp, filter *libseccomp.ScmpFilter) (*os.File, error) {
 	// Generate a patched filter.
 	fprog, err := enosysPatchFilter(config, filter)
 	if err != nil {
-		return -1, fmt.Errorf("error patching filter: %w", err)
+		return nil, fmt.Errorf("error patching filter: %w", err)
 	}
 
 	// Get the set of libseccomp flags set.
 	seccompFlags, noNewPrivs, err := filterFlags(config, filter)
 	if err != nil {
-		return -1, fmt.Errorf("unable to fetch seccomp filter flags: %w", err)
+		return nil, fmt.Errorf("unable to fetch seccomp filter flags: %w", err)
 	}
 
 	// Set no_new_privs if it was requested, though in runc we handle
@@ -708,15 +708,14 @@ func PatchAndLoad(config *configs.Seccomp, filter *libseccomp.ScmpFilter) (int,
 	if noNewPrivs {
 		logrus.Warnf("potentially misconfigured filter -- setting no_new_privs in seccomp path")
 		if err := unix.Prctl(unix.PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil {
-			return -1, fmt.Errorf("error enabling no_new_privs bit: %w", err)
+			return nil, fmt.Errorf("error enabling no_new_privs bit: %w", err)
 		}
 	}
 
 	// Finally, load the filter.
 	fd, err := sysSeccompSetFilter(seccompFlags, fprog)
 	if err != nil {
-		return -1, fmt.Errorf("error loading seccomp filter: %w", err)
+		return nil, fmt.Errorf("error loading seccomp filter: %w", err)
 	}
-
-	return fd, nil
+	return os.NewFile(uintptr(fd), "[seccomp filter]"), nil
 }
diff --git a/libcontainer/seccomp/seccomp_linux.go b/libcontainer/seccomp/seccomp_linux.go
index fed02bcedc4..2c64ebbbadd 100644
--- a/libcontainer/seccomp/seccomp_linux.go
+++ b/libcontainer/seccomp/seccomp_linux.go
@@ -6,6 +6,7 @@ package seccomp
 import (
 	"errors"
 	"fmt"
+	"os"
 
 	libseccomp "github.com/seccomp/libseccomp-golang"
 	"github.com/sirupsen/logrus"
@@ -27,17 +28,16 @@ const (
 )
 
 // InitSeccomp installs the seccomp filters to be used in the container as
-// specified in config.
-// Returns the seccomp file descriptor if any of the filters include a
-// SCMP_ACT_NOTIFY action, otherwise returns -1.
-func InitSeccomp(config *configs.Seccomp) (int, error) {
+// specified in config. Returns the seccomp file descriptor if any of the
+// filters include a SCMP_ACT_NOTIFY action.
+func InitSeccomp(config *configs.Seccomp) (*os.File, error) {
 	if config == nil {
-		return -1, errors.New("cannot initialize Seccomp - nil config passed")
+		return nil, errors.New("cannot initialize Seccomp - nil config passed")
 	}
 
 	defaultAction, err := getAction(config.DefaultAction, config.DefaultErrnoRet)
 	if err != nil {
-		return -1, errors.New("error initializing seccomp - invalid default action")
+		return nil, errors.New("error initializing seccomp - invalid default action")
 	}
 
 	// Ignore the error since pre-2.4 libseccomp is treated as API level 0.
@@ -45,7 +45,7 @@ func InitSeccomp(config *configs.Seccomp) (int, error) {
 	for _, call := range config.Syscalls {
 		if call.Action == configs.Notify {
 			if apiLevel < 6 {
-				return -1, fmt.Errorf("seccomp notify unsupported: API level: got %d, want at least 6. Please try with libseccomp >= 2.5.0 and Linux >= 5.7", apiLevel)
+				return nil, fmt.Errorf("seccomp notify unsupported: API level: got %d, want at least 6. Please try with libseccomp >= 2.5.0 and Linux >= 5.7", apiLevel)
 			}
 
 			// We can't allow the write syscall to notify to the seccomp agent.
@@ -61,36 +61,36 @@ func InitSeccomp(config *configs.Seccomp) (int, error) {
 			// agent allows those syscalls to proceed, initialization works just fine and the agent can
 			// handle future read()/close() syscalls as it wanted.
 			if call.Name == "write" {
-				return -1, errors.New("SCMP_ACT_NOTIFY cannot be used for the write syscall")
+				return nil, errors.New("SCMP_ACT_NOTIFY cannot be used for the write syscall")
 			}
 		}
 	}
 
 	// See comment on why write is not allowed. The same reason applies, as this can mean handling write too.
 	if defaultAction == libseccomp.ActNotify {
-		return -1, errors.New("SCMP_ACT_NOTIFY cannot be used as default action")
+		return nil, errors.New("SCMP_ACT_NOTIFY cannot be used as default action")
 	}
 
 	filter, err := libseccomp.NewFilter(defaultAction)
 	if err != nil {
-		return -1, fmt.Errorf("error creating filter: %w", err)
+		return nil, fmt.Errorf("error creating filter: %w", err)
 	}
 
 	// Add extra architectures
 	for _, arch := range config.Architectures {
 		scmpArch, err := libseccomp.GetArchFromString(arch)
 		if err != nil {
-			return -1, fmt.Errorf("error validating Seccomp architecture: %w", err)
+			return nil, fmt.Errorf("error validating Seccomp architecture: %w", err)
 		}
 		if err := filter.AddArch(scmpArch); err != nil {
-			return -1, fmt.Errorf("error adding architecture to seccomp filter: %w", err)
+			return nil, fmt.Errorf("error adding architecture to seccomp filter: %w", err)
 		}
 	}
 
 	// Add extra flags.
 	for _, flag := range config.Flags {
 		if err := setFlag(filter, flag); err != nil {
-			return -1, err
+			return nil, err
 		}
 	}
 
@@ -110,25 +110,24 @@ func InitSeccomp(config *configs.Seccomp) (int, error) {
 
 	// Unset no new privs bit
 	if err := filter.SetNoNewPrivsBit(false); err != nil {
-		return -1, fmt.Errorf("error setting no new privileges: %w", err)
+		return nil, fmt.Errorf("error setting no new privileges: %w", err)
 	}
 
 	// Add a rule for each syscall
 	for _, call := range config.Syscalls {
 		if call == nil {
-			return -1, errors.New("encountered nil syscall while initializing Seccomp")
+			return nil, errors.New("encountered nil syscall while initializing Seccomp")
 		}
 
 		if err := matchCall(filter, call, defaultAction); err != nil {
-			return -1, err
+			return nil, err
 		}
 	}
 
 	seccompFd, err := patchbpf.PatchAndLoad(config, filter)
 	if err != nil {
-		return -1, fmt.Errorf("error loading seccomp filter into kernel: %w", err)
+		return nil, fmt.Errorf("error loading seccomp filter into kernel: %w", err)
 	}
-
 	return seccompFd, nil
 }
 
diff --git a/libcontainer/seccomp/seccomp_unsupported.go b/libcontainer/seccomp/seccomp_unsupported.go
index 885529dc7d0..b08a3498687 100644
--- a/libcontainer/seccomp/seccomp_unsupported.go
+++ b/libcontainer/seccomp/seccomp_unsupported.go
@@ -5,6 +5,7 @@ package seccomp
 
 import (
 	"errors"
+	"os"
 
 	"github.com/opencontainers/runc/libcontainer/configs"
 	"github.com/opencontainers/runtime-spec/specs-go"
@@ -13,11 +14,11 @@ import (
 var ErrSeccompNotEnabled = errors.New("seccomp: config provided but seccomp not supported")
 
 // InitSeccomp does nothing because seccomp is not supported.
-func InitSeccomp(config *configs.Seccomp) (int, error) {
+func InitSeccomp(config *configs.Seccomp) (*os.File, error) {
 	if config != nil {
-		return -1, ErrSeccompNotEnabled
+		return nil, ErrSeccompNotEnabled
 	}
-	return -1, nil
+	return nil, nil
 }
 
 // FlagSupported tells if a provided seccomp flag is supported.
diff --git a/libcontainer/setns_init_linux.go b/libcontainer/setns_init_linux.go
index ac58190758a..40a47a2e95c 100644
--- a/libcontainer/setns_init_linux.go
+++ b/libcontainer/setns_init_linux.go
@@ -75,7 +75,6 @@ func (l *linuxSetnsInit) Init() error {
 		if err != nil {
 			return err
 		}
-
 		if err := syncParentSeccomp(l.pipe, seccompFd); err != nil {
 			return err
 		}
@@ -94,7 +93,6 @@ func (l *linuxSetnsInit) Init() error {
 		if err != nil {
 			return fmt.Errorf("unable to init seccomp: %w", err)
 		}
-
 		if err := syncParentSeccomp(l.pipe, seccompFd); err != nil {
 			return err
 		}
diff --git a/libcontainer/standard_init_linux.go b/libcontainer/standard_init_linux.go
index f3d04282362..3d97b06d1b5 100644
--- a/libcontainer/standard_init_linux.go
+++ b/libcontainer/standard_init_linux.go
@@ -25,7 +25,6 @@ type linuxStandardInit struct {
 	parentPid     int
 	fifoFd        int
 	logFd         int
-	mountFds      mountFds
 	config        *initConfig
 }
 
@@ -86,17 +85,7 @@ func (l *linuxStandardInit) Init() error {
 	// initialises the labeling system
 	selinux.GetEnabled()
 
-	// We don't need the mount nor idmap fds after prepareRootfs() nor if it fails.
-	err := prepareRootfs(l.pipe, l.config, l.mountFds)
-	for _, m := range append(l.mountFds.sourceFds, l.mountFds.idmapFds...) {
-		if m == -1 {
-			continue
-		}
-		if err := unix.Close(m); err != nil {
-			return fmt.Errorf("unable to close mountFds fds: %w", err)
-		}
-	}
-
+	err := prepareRootfs(l.pipe, l.config)
 	if err != nil {
 		return err
 	}
diff --git a/libcontainer/sync.go b/libcontainer/sync.go
index 25dc2863071..2cb659ff94e 100644
--- a/libcontainer/sync.go
+++ b/libcontainer/sync.go
@@ -5,6 +5,7 @@ import (
 	"errors"
 	"fmt"
 	"io"
+	"os"
 
 	"github.com/opencontainers/runc/libcontainer/utils"
 )
@@ -15,15 +16,24 @@ type syncType string
 // during container setup. They come in pairs (with procError being a generic
 // response which is followed by an &initError).
 //
-//	[  child  ] <-> [   parent   ]
+//	     [  child  ] <-> [   parent   ]
 //
-//	procHooks   --> [run hooks]
-//	            <-- procResume
+//	procMountPlease      --> [open_tree(2) and configure mount]
+//	  Arg: configs.Mount
+//	                     <-- procMountFd
+//	                           file: mountfd
 //
-//	procReady   --> [final setup]
-//	            <-- procRun
+//	procSeccomp         --> [forward fd to listenerPath]
+//	  file: seccomp fd
+//	                    --- no return synchronisation
 //
-//	procSeccomp --> [pick up seccomp fd with pidfd_getfd()]
+//	procHooks --> [run hooks]
+//	          <-- procResume
+//
+//	procReady --> [final setup]
+//	          <-- procRun
+//
+//	procSeccomp --> [grab seccomp fd with pidfd_getfd()]
 //	            <-- procSeccompDone
 const (
 	procError       syncType = "procError"
@@ -31,13 +41,23 @@ const (
 	procRun         syncType = "procRun"
 	procHooks       syncType = "procHooks"
 	procResume      syncType = "procResume"
+	procMountPlease syncType = "procMountPlease"
+	procMountFd     syncType = "procMountFd"
 	procSeccomp     syncType = "procSeccomp"
 	procSeccompDone syncType = "procSeccompDone"
 )
 
+type syncFlags int
+
+const (
+	syncFlagHasFd syncFlags = (1 << iota)
+)
+
 type syncT struct {
-	Type syncType `json:"type"`
-	Fd   int      `json:"fd"`
+	Type  syncType         `json:"type"`
+	Flags syncFlags        `json:"flags"`
+	Arg   *json.RawMessage `json:"arg,omitempty"`
+	File  *os.File         `json:"-"` // passed oob through SCM_RIGHTS
 }
 
 // initError is used to wrap errors for passing them via JSON,
@@ -50,74 +70,100 @@ func (i initError) Error() string {
 	return i.Message
 }
 
-// writeSync is used to write to a synchronisation pipe. An error is returned
-// if there was a problem writing the payload.
-func writeSync(pipe io.Writer, sync syncType) error {
-	return writeSyncWithFd(pipe, sync, -1)
+func doWriteSync(pipe *os.File, sync syncT) error {
+	sync.Flags &= ^syncFlagHasFd
+	if sync.File != nil {
+		sync.Flags |= syncFlagHasFd
+	}
+	if err := utils.WriteJSON(pipe, sync); err != nil {
+		return fmt.Errorf("writing sync %q: %w", sync.Type, err)
+	}
+	if sync.Flags&syncFlagHasFd != 0 {
+		if err := utils.SendFile(pipe, sync.File); err != nil {
+			return fmt.Errorf("sending file after sync %q: %w", sync.Type, err)
+		}
+	}
+	return nil
 }
 
-// writeSyncWithFd is used to write to a synchronisation pipe. An error is
-// returned if there was a problem writing the payload.
-func writeSyncWithFd(pipe io.Writer, sync syncType, fd int) error {
-	if err := utils.WriteJSON(pipe, syncT{sync, fd}); err != nil {
-		return fmt.Errorf("writing syncT %q: %w", string(sync), err)
+func writeSync(pipe *os.File, sync syncType) error {
+	return doWriteSync(pipe, syncT{Type: sync})
+}
+
+func writeSyncArg(pipe *os.File, sync syncType, arg interface{}) error {
+	argJSON, err := json.Marshal(arg)
+	if err != nil {
+		return fmt.Errorf("writing sync %q: marshal argument failed: %w", sync, err)
 	}
-	return nil
+	argJSONMsg := json.RawMessage(argJSON)
+	return doWriteSync(pipe, syncT{Type: sync, Arg: &argJSONMsg})
 }
 
-// readSync is used to read from a synchronisation pipe. An error is returned
-// if we got an initError, the pipe was closed, or we got an unexpected flag.
-func readSync(pipe io.Reader, expected syncType) error {
-	var procSync syncT
-	if err := json.NewDecoder(pipe).Decode(&procSync); err != nil {
+func doReadSync(pipe *os.File) (syncT, error) {
+	var sync syncT
+	if err := json.NewDecoder(pipe).Decode(&sync); err != nil {
 		if errors.Is(err, io.EOF) {
-			return errors.New("parent closed synchronisation channel")
+			return sync, err
 		}
-		return fmt.Errorf("failed reading error from parent: %w", err)
+		return sync, fmt.Errorf("reading from parent failed: %w", err)
 	}
-
-	if procSync.Type == procError {
+	if sync.Type == procError {
 		var ierr initError
-
-		if err := json.NewDecoder(pipe).Decode(&ierr); err != nil {
-			return fmt.Errorf("failed reading error from parent: %w", err)
+		if sync.Arg == nil {
+			return sync, errors.New("procError missing error payload")
+		}
+		if err := json.Unmarshal(*sync.Arg, &ierr); err != nil {
+			return sync, fmt.Errorf("unmarshal procError failed: %w", err)
+		}
+		return sync, &ierr
+	}
+	if sync.Flags&syncFlagHasFd != 0 {
+		file, err := utils.RecvFile(pipe)
+		if err != nil {
+			return sync, fmt.Errorf("receiving fd from sync %q failed: %w", sync.Type, err)
 		}
+		sync.File = file
+	}
+	return sync, nil
+}
 
-		return &ierr
+func readSyncFull(pipe *os.File, expected syncType) (syncT, error) {
+	sync, err := doReadSync(pipe)
+	if err != nil {
+		return sync, err
 	}
+	if sync.Type != expected {
+		return sync, fmt.Errorf("unexpected synchronisation flag: got %q, expected %q", sync.Type, expected)
+	}
+	return sync, nil
+}
 
-	if procSync.Type != expected {
-		return errors.New("invalid synchronisation flag from parent")
+func readSync(pipe *os.File, expected syncType) error {
+	sync, err := readSyncFull(pipe, expected)
+	if err != nil {
+		return err
+	}
+	if sync.Arg != nil {
+		return fmt.Errorf("sync %q had unexpected argument passed: %q", expected, string(*sync.Arg))
+	}
+	if sync.File != nil {
+		_ = sync.File.Close()
+		return fmt.Errorf("sync %q had unexpected file passed", sync.Type)
 	}
 	return nil
 }
 
 // parseSync runs the given callback function on each syncT received from the
 // child. It will return once io.EOF is returned from the given pipe.
-func parseSync(pipe io.Reader, fn func(*syncT) error) error {
-	dec := json.NewDecoder(pipe)
+func parseSync(pipe *os.File, fn func(*syncT) error) error {
 	for {
-		var sync syncT
-		if err := dec.Decode(&sync); err != nil {
+		sync, err := doReadSync(pipe)
+		if err != nil {
 			if errors.Is(err, io.EOF) {
 				break
 			}
 			return err
 		}
-
-		// We handle this case outside fn for cleanliness reasons.
-		var ierr *initError
-		if sync.Type == procError {
-			if err := dec.Decode(&ierr); err != nil && !errors.Is(err, io.EOF) {
-				return fmt.Errorf("error decoding proc error from init: %w", err)
-			}
-			if ierr != nil {
-				return ierr
-			}
-			// Programmer error.
-			panic("No error following JSON procError payload.")
-		}
-
 		if err := fn(&sync); err != nil {
 			return err
 		}
diff --git a/libcontainer/system/linux.go b/libcontainer/system/linux.go
index e1d6eb18034..200d375e840 100644
--- a/libcontainer/system/linux.go
+++ b/libcontainer/system/linux.go
@@ -4,10 +4,14 @@
 package system
 
 import (
+	"errors"
+	"fmt"
+	"io"
 	"os"
 	"os/exec"
 	"unsafe"
 
+	"github.com/sirupsen/logrus"
 	"golang.org/x/sys/unix"
 )
 
@@ -102,3 +106,40 @@ func GetSubreaper() (int, error) {
 
 	return int(i), nil
 }
+
+func ExecutableMemfd(comment string, flags int) (*os.File, error) {
+	// Try to use MFD_EXEC first. On pre-6.3 kernels we get -EINVAL for this
+	// flag. On post-6.3 kernels, with vm.memfd_noexec=1 this ensures we get an
+	// executable memfd. For vm.memfd_noexec=2 this is a bit more complicated.
+	// The original vm.memfd_noexec=2 implementation incorrectly silently
+	// allowed MFD_EXEC[1] -- this should be fixed in 6.6. On 6.6 and newer
+	// kernels, we will get -EACCES if we try to use MFD_EXEC with
+	// vm.memfd_noexec=2 (for 6.3-6.5, -EINVAL was the intended return value).
+	//
+	// The upshot is we only need to retry without MFD_EXEC on -EINVAL because
+	// it just so happens that passing MFD_EXEC bypasses vm.memfd_noexec=2 on
+	// kernels where -EINVAL is actually a security denial.
+	memfd, err := unix.MemfdCreate(comment, flags|unix.MFD_EXEC)
+	if errors.Is(err, unix.EINVAL) {
+		memfd, err = unix.MemfdCreate(comment, flags)
+	}
+	if err != nil {
+		if errors.Is(err, unix.EACCES) {
+			logrus.Infof("memfd_create(MFD_EXEC) failed, possibly due to vm.memfd_noexec=2 -- falling back to less secure O_TMPFILE")
+		}
+		err := os.NewSyscallError("memfd_create", err)
+		return nil, fmt.Errorf("failed to create executable memfd: %w", err)
+	}
+	return os.NewFile(uintptr(memfd), "/memfd:"+comment), nil
+}
+
+// Copy is a wrapper around io.Copy that continues the copy despite EINTR.
+func Copy(dst io.Writer, src io.Reader) (written int64, err error) {
+	for {
+		n, err := io.Copy(dst, src)
+		written += n
+		if !errors.Is(err, unix.EINTR) {
+			return written, err
+		}
+	}
+}
diff --git a/libcontainer/userns/usernsfd_linux.go b/libcontainer/userns/usernsfd_linux.go
new file mode 100644
index 00000000000..0f518c5d5fe
--- /dev/null
+++ b/libcontainer/userns/usernsfd_linux.go
@@ -0,0 +1,134 @@
+package userns
+
+import (
+	"fmt"
+	"os"
+	"sort"
+	"strings"
+	"sync"
+	"syscall"
+
+	"github.com/sirupsen/logrus"
+	"golang.org/x/sys/unix"
+
+	"github.com/opencontainers/runc/libcontainer/configs"
+)
+
+type Mapping struct {
+	UIDMappings []configs.IDMap
+	GIDMappings []configs.IDMap
+}
+
+func (m Mapping) toSys() (uids, gids []syscall.SysProcIDMap) {
+	for _, uid := range m.UIDMappings {
+		uids = append(uids, syscall.SysProcIDMap{
+			ContainerID: uid.ContainerID,
+			HostID:      uid.HostID,
+			Size:        uid.Size,
+		})
+	}
+	for _, gid := range m.GIDMappings {
+		gids = append(gids, syscall.SysProcIDMap{
+			ContainerID: gid.ContainerID,
+			HostID:      gid.HostID,
+			Size:        gid.Size,
+		})
+	}
+	return
+}
+
+// id returns a unique identifier for this mapping, agnostic of the order of
+// the uid and gid mappings (because the order doesn't matter to the kernel).
+// The set of userns handles is indexed using this ID.
+func (m Mapping) id() string {
+	var uids, gids []string
+	for _, idmap := range m.UIDMappings {
+		uids = append(uids, fmt.Sprintf("%d:%d:%d", idmap.ContainerID, idmap.HostID, idmap.Size))
+	}
+	for _, idmap := range m.GIDMappings {
+		gids = append(gids, fmt.Sprintf("%d:%d:%d", idmap.ContainerID, idmap.HostID, idmap.Size))
+	}
+	// We don't care about the sort order -- just sort them.
+	sort.Strings(uids)
+	sort.Strings(gids)
+	return "uid=" + strings.Join(uids, ",") + ";gid=" + strings.Join(gids, ",")
+}
+
+type Handles struct {
+	m     sync.Mutex
+	procs map[string]*os.Process
+}
+
+// Release kills all of the child processes created with this Handle, releasing
+// the resources used to create existing userns handles. The kernel guarantees
+// all existing files returned from Get() will continue to work even after
+// calling Release(). The same Handles can be re-used after calling Release().
+func (hs *Handles) Release() {
+	hs.m.Lock()
+	defer hs.m.Unlock()
+
+	for _, proc := range hs.procs {
+		// We can't do anything about errors here, and in the case of runc we
+		// don't care because in the worst case we end up with zombies that
+		// pid1 can deal with.
+		_ = proc.Kill()
+		_, _ = proc.Wait()
+	}
+	hs.procs = make(map[string]*os.Process)
+}
+
+func spawnProc(req Mapping) (*os.Process, error) {
+	// We need to spawn a subprocess with the requested mappings, which is
+	// unfortunately quite expensive. The "safe" way of doing this is natively
+	// with Go (and then spawning something like "sleep infinity"), but
+	// execve() is a waste of cycles because we just need some process to have
+	// the right mapping, we don't care what it's executing. The "unsafe"
+	// option of doing a clone() behind the back of Go is probably okay in
+	// theory as long as we just do kill(getpid(), SIGSTOP). However, if we
+	// tell Go to put the new process into PTRACE_TRACEME mode, we can avoid
+	// the exec and not have to faff around with the mappings.
+	//
+	// Note that Go's stdlib does not support newuidmap, but in the case of
+	// id-mapped mounts, it seems incredibly unlikely that the user will be
+	// requesting us to do a remapping as an unprivileged user with mappings
+	// they have privileges over.
+	logrus.Debugf("spawning dummy process for id-mapping %s", req.id())
+	uidMappings, gidMappings := req.toSys()
+	return os.StartProcess("/proc/self/exe", []string{"runc", "--help"}, &os.ProcAttr{
+		Sys: &syscall.SysProcAttr{
+			Cloneflags:                 unix.CLONE_NEWUSER,
+			UidMappings:                uidMappings,
+			GidMappings:                gidMappings,
+			GidMappingsEnableSetgroups: false,
+			// Put the process into PTRACE_TRACEME mode to allow us to get the
+			// userns without having a proper execve() target.
+			Ptrace: true,
+		},
+	})
+}
+
+// Get returns a handle to a /proc/$pid/ns/user nsfs file with the requested
+// mapping. The processes spawned to produce userns nsfds are cached, so if
+// equivalent user namespace mappings are requested, the same user namespace
+// will be returned. The caller is responsible for closing the returned file
+// descriptor.
+func (hs *Handles) Get(req Mapping) (f *os.File, err error) {
+	hs.m.Lock()
+	defer hs.m.Unlock()
+
+	if hs.procs == nil {
+		hs.procs = make(map[string]*os.Process)
+	}
+
+	proc, ok := hs.procs[req.id()]
+	if !ok {
+		proc, err = spawnProc(req)
+		if err != nil {
+			return nil, fmt.Errorf("failed to spawn dummy process for map %s: %w", req.id(), err)
+		}
+		// TODO: Maybe we should just cache handles to /proc/$pid/ns/user, so we
+		//       can kill the process immediately?
+		hs.procs[req.id()] = proc
+	}
+	return os.Open(fmt.Sprintf("/proc/%d/ns/user", proc.Pid))
+}
diff --git a/libcontainer/userns/usernsfd_linux_test.go b/libcontainer/userns/usernsfd_linux_test.go
new file mode 100644
index 00000000000..c28f55d8eff
--- /dev/null
+++ b/libcontainer/userns/usernsfd_linux_test.go
@@ -0,0 +1,52 @@
+package userns
+
+import (
+	"os"
+	"testing"
+
+	"github.com/opencontainers/runc/libcontainer/configs"
+)
+
+func BenchmarkSpawnProc(b *testing.B) {
+	if os.Geteuid() != 0 {
+		b.Skip("spawning user namespaced processes requires root")
+	}
+
+	// We can re-use the mapping as we call spawnProc() directly.
+	mapping := Mapping{
+		UIDMappings: []configs.IDMap{
+			{ContainerID: 0, HostID: 1337, Size: 142},
+			{ContainerID: 150, HostID: 0, Size: 1},
+			{ContainerID: 442, HostID: 1111, Size: 12},
+			{ContainerID: 1000, HostID: 9999, Size: 92},
+			{ContainerID: 9999, HostID: 1000000, Size: 4},
+			// Newer kernels support more than 5 entries, but stick to 5 here.
+		},
+		GIDMappings: []configs.IDMap{
+			{ContainerID: 1, HostID: 2337, Size: 142},
+			{ContainerID: 145, HostID: 6, Size: 1},
+			{ContainerID: 200, HostID: 1000, Size: 12},
+			{ContainerID: 1000, HostID: 9888, Size: 92},
+			{ContainerID: 8999, HostID: 1000000, Size: 4},
+			// Newer kernels support more than 5 entries, but stick to 5 here.
+		},
+	}
+
+	procs := make([]*os.Process, 0, b.N)
+	b.Cleanup(func() {
+		for _, proc := range procs {
+			if proc != nil {
+				_ = proc.Kill()
+				_, _ = proc.Wait()
+			}
+		}
+	})
+
+	for i := 0; i < b.N; i++ {
+		proc, err := spawnProc(mapping)
+		if err != nil {
+			b.Error(err)
+		}
+		procs = append(procs, proc)
+	}
+}
diff --git a/libcontainer/utils/cmsg.go b/libcontainer/utils/cmsg.go
index fd9326cb59a..c4c122e41fe 100644
--- a/libcontainer/utils/cmsg.go
+++ b/libcontainer/utils/cmsg.go
@@ -19,13 +19,14 @@ package utils
 import (
 	"fmt"
 	"os"
+	"runtime"
 
 	"golang.org/x/sys/unix"
 )
 
-// MaxNameLen is the maximum length of the name of a file descriptor being
-// sent using SendFd. The name of the file handle returned by RecvFd will never
-// be larger than this value.
+// MaxNameLen is the maximum length of the name of a file descriptor being sent
+// using SendFd. The name of the file handle returned by RecvFile will never be
+// larger than this value.
 const MaxNameLen = 4096
 
 // oobSpace is the size of the oob slice required to store a single FD. Note
@@ -33,26 +34,21 @@ const MaxNameLen = 4096
 // so sizeof(fd) = 4.
 var oobSpace = unix.CmsgSpace(4)
 
-// RecvFd waits for a file descriptor to be sent over the given AF_UNIX
+// RecvFile waits for a file descriptor to be sent over the given AF_UNIX
 // socket. The file name of the remote file descriptor will be recreated
 // locally (it is sent as non-auxiliary data in the same payload).
-func RecvFd(socket *os.File) (*os.File, error) {
-	// For some reason, unix.Recvmsg uses the length rather than the capacity
-	// when passing the msg_controllen and other attributes to recvmsg.  So we
-	// have to actually set the length.
+func RecvFile(socket *os.File) (_ *os.File, Err error) {
 	name := make([]byte, MaxNameLen)
 	oob := make([]byte, oobSpace)
 
 	sockfd := socket.Fd()
-	n, oobn, _, _, err := unix.Recvmsg(int(sockfd), name, oob, 0)
+	n, oobn, _, _, err := unix.Recvmsg(int(sockfd), name, oob, unix.MSG_CMSG_CLOEXEC)
 	if err != nil {
 		return nil, err
 	}
-
 	if n >= MaxNameLen || oobn != oobSpace {
 		return nil, fmt.Errorf("recvfd: incorrect number of bytes read (n=%d oobn=%d)", n, oobn)
 	}
-
 	// Truncate.
 	name = name[:n]
 	oob = oob[:oobn]
@@ -61,36 +57,61 @@ func RecvFd(socket *os.File) (*os.File, error) {
 	if err != nil {
 		return nil, err
 	}
-	if len(scms) != 1 {
-		return nil, fmt.Errorf("recvfd: number of SCMs is not 1: %d", len(scms))
+
+	// We cannot control how many SCM_RIGHTS we receive, and upon receiving
+	// them all of the descriptors are installed in our fd table, so we need to
+	// parse all of the SCM_RIGHTS we received in order to close all of the
+	// descriptors on error.
+	var fds []int
+	defer func() {
+		for i, fd := range fds {
+			// Only close 0 if err != nil, and close everything else.
+			if i != 0 || err != nil {
+				_ = unix.Close(fd)
+			}
+		}
+	}()
+	var lastErr error
+	for _, scm := range scms {
+		if scm.Header.Type == unix.SCM_RIGHTS {
+			scmFds, err := unix.ParseUnixRights(&scm)
+			if err != nil {
+				lastErr = err
+			} else {
+				fds = append(fds, scmFds...)
+			}
+		}
+	}
+	if lastErr != nil {
+		return nil, lastErr
 	}
-	scm := scms[0]
 
-	fds, err := unix.ParseUnixRights(&scm)
-	if err != nil {
-		return nil, err
+	// We do this after collecting the fds to make sure we close them all when
+	// returning an error here.
+	if len(scms) != 1 {
+		return nil, fmt.Errorf("recvfd: number of SCMs is not 1: %d", len(scms))
 	}
 	if len(fds) != 1 {
 		return nil, fmt.Errorf("recvfd: number of fds is not 1: %d", len(fds))
 	}
-	fd := uintptr(fds[0])
-
-	return os.NewFile(fd, string(name)), nil
+	return os.NewFile(uintptr(fds[0]), string(name)), nil
 }
 
-// SendFd sends a file descriptor over the given AF_UNIX socket. In
-// addition, the file.Name() of the given file will also be sent as
-// non-auxiliary data in the same payload (allowing to send contextual
-// information for a file descriptor).
-func SendFd(socket *os.File, name string, fd uintptr) error {
+// SendFile sends a file over the given AF_UNIX socket. file.Name() is also
+// included so that if the other end uses RecvFile, the file will have the same
+// name information.
+func SendFile(socket *os.File, file *os.File) error {
+	name := file.Name()
 	if len(name) >= MaxNameLen {
 		return fmt.Errorf("sendfd: filename too long: %s", name)
 	}
-	return SendFds(socket, []byte(name), int(fd))
+	err := SendRawFd(socket, name, file.Fd())
+	runtime.KeepAlive(file)
+	return err
 }
 
-// SendFds sends a list of files descriptor and msg over the given AF_UNIX socket.
-func SendFds(socket *os.File, msg []byte, fds ...int) error {
-	oob := unix.UnixRights(fds...)
-	return unix.Sendmsg(int(socket.Fd()), msg, oob, nil, 0)
+// SendRawFd sends a specific file descriptor over the given AF_UNIX socket.
+func SendRawFd(socket *os.File, msg string, fd uintptr) error {
+	oob := unix.UnixRights(int(fd))
+	return unix.Sendmsg(int(socket.Fd()), []byte(msg), oob, nil, 0)
 }
diff --git a/libcontainer/utils/utils_unix.go b/libcontainer/utils/utils_unix.go
index 6b9a7be038f..ca520b63b36 100644
--- a/libcontainer/utils/utils_unix.go
+++ b/libcontainer/utils/utils_unix.go
@@ -91,7 +91,7 @@ func CloseExecFrom(minFd int) error {
 }
 
 // NewSockPair returns a new unix socket pair
-func NewSockPair(name string) (parent *os.File, child *os.File, err error) {
+func NewSockPair(name string) (parent, child *os.File, err error) {
 	fds, err := unix.Socketpair(unix.AF_LOCAL, unix.SOCK_STREAM|unix.SOCK_CLOEXEC, 0)
 	if err != nil {
 		return nil, nil, err
diff --git a/tests/integration/idmap.bats b/tests/integration/idmap.bats
index 375191bf705..0dac40b21e7 100644
--- a/tests/integration/idmap.bats
+++ b/tests/integration/idmap.bats
@@ -2,6 +2,9 @@
 
 load helpers
 
+OVERFLOW_UID="$(cat /proc/sys/kernel/overflowuid)"
+OVERFLOW_GID="$(cat /proc/sys/kernel/overflowgid)"
+
 function setup() {
 	requires root
 	requires_kernel 5.12
@@ -9,152 +12,306 @@ function setup() {
 
 	setup_debian
 
-	# Prepare source folders for bind mount
-	mkdir -p source-{1,2}/
-	touch source-{1,2}/foo.txt
+	# Prepare source folders for mounts.
+	mkdir -p source-{1,2,multi{1,2,3}}/
+	touch source-{1,2,multi{1,2,3}}/foo.txt
+	touch source-multi{1,2,3}/{bar,baz}.txt
 
-	# Use other owner for source-2
+	# Change the owners for everything other than source-1.
 	chown 1:1 source-2/foo.txt
 
-	mkdir -p rootfs/{proc,sys,tmp}
-	mkdir -p rootfs/tmp/mount-{1,2}
-	mkdir -p rootfs/mnt/bind-mount-{1,2}
-
-	update_config ' .linux.namespaces += [{"type": "user"}]
-		| .linux.uidMappings += [{"hostID": 100000, "containerID": 0, "size": 65536}]
-		| .linux.gidMappings += [{"hostID": 100000, "containerID": 0, "size": 65536}]
-		| .process.args += ["-c", "stat -c =%u=%g= /tmp/mount-1/foo.txt"]
-		| .mounts += [
-					{
-						"source": "source-1/",
-						"destination": "/tmp/mount-1",
-						"options": ["bind"],
-						"uidMappings": [ {
-						                  "containerID": 0,
-						                  "hostID": 100000,
-						                  "size": 65536
-						                }
-						],
-						"gidMappings": [        {
-						                  "containerID": 0,
-						                  "hostID": 100000,
-						                  "size": 65536
-						                }
-						]
-					}
-				] '
+	chown 100:211 source-multi1/foo.txt
+	chown 101:222 source-multi1/bar.txt
+	chown 102:233 source-multi1/baz.txt
+
+	# Same gids as multi1, different uids.
+	chown 200:211 source-multi2/foo.txt
+	chown 201:222 source-multi2/bar.txt
+	chown 202:233 source-multi2/baz.txt
+
+	# 1000 uids, 500 gids
+	chown 5000528:6000491 source-multi3/foo.txt
+	chown 5000133:6000337 source-multi3/bar.txt
+	chown 5000999:6000444 source-multi3/baz.txt
 }
 
 function teardown() {
 	teardown_bundle
 }
 
-@test "simple idmap mount" {
+function setup_idmap_userns() {
+	update_config '.linux.namespaces += [{"type": "user"}]
+		| .linux.uidMappings += [{"containerID": 0, "hostID": 100000, "size": 65536}]
+		| .linux.gidMappings += [{"containerID": 0, "hostID": 100000, "size": 65536}]'
+}
+
+function setup_bind_mount() {
+	mountname="${1:-1}"
+	update_config '.mounts += [
+			{
+				"source": "source-'"$mountname"'/",
+				"destination": "/tmp/bind-mount-'"$mountname"'",
+				"options": ["bind"]
+			}
+		]'
+}
+
+function setup_idmap_single_mount() {
+	uidmap="$1"       # ctr:host:size
+	gidmap="${2:-$1}" # ctr:host:size
+	mountname="${3:-1}"
+	destname="${4:-$mountname}"
+
+	read -r uid_containerID uid_hostID uid_size <<<"$(tr : ' ' <<<"$uidmap")"
+	read -r gid_containerID gid_hostID gid_size <<<"$(tr : ' ' <<<"$gidmap")"
+
+	update_config '.mounts += [
+			{
+				"source": "source-'"$mountname"'/",
+				"destination": "/tmp/mount-'"$destname"'",
+				"options": ["bind"],
+				"uidMappings": [{"containerID": '"$uid_containerID"', "hostID": '"$uid_hostID"', "size": '"$uid_size"'}],
+				"gidMappings": [{"containerID": '"$gid_containerID"', "hostID": '"$gid_hostID"', "size": '"$gid_size"'}]
+			}
+		]'
+}
+
+function setup_idmap_basic_mount() {
+	mountname="${1:-1}"
+	setup_idmap_single_mount 0:100000:65536 0:100000:65536 "$mountname"
+}
+
+@test "simple idmap mount [userns]" {
+	setup_idmap_userns
+	setup_idmap_basic_mount
+
+	update_config '.process.args = ["sh", "-c", "stat -c =%u=%g= /tmp/mount-1/foo.txt"]'
+
 	runc run test_debian
 	[ "$status" -eq 0 ]
 	[[ "$output" == *"=0=0="* ]]
 }
 
-@test "write to an idmap mount" {
-	update_config ' .process.args = ["sh", "-c", "touch /tmp/mount-1/bar && stat -c =%u=%g= /tmp/mount-1/bar"]'
+@test "simple idmap mount [no userns]" {
+	setup_idmap_basic_mount
+
+	update_config '.process.args = ["sh", "-c", "stat -c =%u=%g= /tmp/mount-1/foo.txt"]'
+
+	runc run test_debian
+	[ "$status" -eq 0 ]
+	[[ "$output" == *"=100000=100000="* ]]
+}
+
+@test "write to an idmap mount [userns]" {
+	setup_idmap_userns
+	setup_idmap_basic_mount
+
+	update_config '.process.args = ["sh", "-c", "touch /tmp/mount-1/bar && stat -c =%u=%g= /tmp/mount-1/bar"]'
+
 	runc run test_debian
 	[ "$status" -eq 0 ]
 	[[ "$output" == *"=0=0="* ]]
 }
 
-@test "idmap mount with propagation flag" {
-	update_config ' .process.args = ["sh", "-c", "findmnt -o PROPAGATION /tmp/mount-1"]'
+@test "write to an idmap mount [no userns]" {
+	setup_idmap_basic_mount
+
+	update_config '.process.args = ["sh", "-c", "touch /tmp/mount-1/bar && stat -c =%u=%g= /tmp/mount-1/bar"]'
+
+	runc run test_debian
+	# The write must fail because the user is unmapped.
+	[ "$status" -ne 0 ]
+	[[ "$output" == *"Value too large for defined data type"* ]] # ERANGE
+}
+
+@test "idmap mount with propagation flag [userns]" {
+	setup_idmap_userns
+	setup_idmap_basic_mount
+
+	update_config '.process.args = ["sh", "-c", "findmnt -o PROPAGATION /tmp/mount-1"]'
 	# Add the shared option to the idmap mount
-	update_config ' .mounts |= map((select(.source == "source-1/") | .options += ["shared"]) // .)'
+	update_config '.mounts |= map((select(.source == "source-1/") | .options += ["shared"]) // .)'
 
 	runc run test_debian
 	[ "$status" -eq 0 ]
 	[[ "$output" == *"shared"* ]]
 }
 
-@test "idmap mount with bind mount" {
-	update_config '   .mounts += [
-					{
-						"source": "source-2/",
-						"destination": "/tmp/mount-2",
-						"options": ["bind"]
-					}
-				] '
+@test "idmap mount with bind mount [userns]" {
+	setup_idmap_userns
+	setup_idmap_basic_mount
+	setup_bind_mount
+
+	update_config '.process.args = ["bash", "-c", "stat -c =%n:%u=%g= /tmp/{,bind-}mount-1/foo.txt"]'
 
 	runc run test_debian
 	[ "$status" -eq 0 ]
-	[[ "$output" == *"=0=0="* ]]
+	[[ "$output" == *"=/tmp/mount-1/foo.txt:0=0="* ]]
+	[[ "$output" == *"=/tmp/bind-mount-1/foo.txt:$OVERFLOW_UID=$OVERFLOW_GID="* ]]
 }
 
-@test "two idmap mounts with two bind mounts" {
-	update_config '   .process.args = ["sh", "-c", "stat -c =%u=%g= /tmp/mount-1/foo.txt /tmp/mount-2/foo.txt"]
-			| .mounts += [
-					{
-						"source": "source-1/",
-						"destination": "/mnt/bind-mount-1",
-						"options": ["bind"]
-					},
-					{
-						"source": "source-2/",
-						"destination": "/mnt/bind-mount-2",
-						"options": ["bind"]
-					},
-					{
-						"source": "source-2/",
-						"destination": "/tmp/mount-2",
-						"options": ["bind"],
-						"uidMappings": [ {
-						                  "containerID": 0,
-						                  "hostID": 100000,
-						                  "size": 65536
-						                }
-						],
-						"gidMappings": [        {
-						                  "containerID": 0,
-						                  "hostID": 100000,
-						                  "size": 65536
-						                }
-						]
-					}
-				] '
+@test "idmap mount with bind mount [no userns]" {
+	setup_idmap_basic_mount
+	setup_bind_mount
+
+	update_config '.process.args = ["bash", "-c", "stat -c =%n:%u=%g= /tmp/{,bind-}mount-1/foo.txt"]'
 
 	runc run test_debian
 	[ "$status" -eq 0 ]
-	[[ "$output" == *"=0=0="* ]]
-	# source-2/ is owned by 1:1, so we expect this with the idmap mount too.
-	[[ "$output" == *"=1=1="* ]]
+	[[ "$output" == *"=/tmp/mount-1/foo.txt:100000=100000="* ]]
+	[[ "$output" == *"=/tmp/bind-mount-1/foo.txt:0=0="* ]]
+}
+
+@test "two idmap mounts (same mapping) with two bind mounts [userns]" {
+	setup_idmap_userns
+
+	setup_idmap_basic_mount 1
+	setup_bind_mount 1
+	setup_bind_mount 2
+	setup_idmap_basic_mount 2
+
+	update_config '.process.args = ["bash", "-c", "stat -c =%n:%u=%g= /tmp/mount-[12]/foo.txt"]'
+
+	runc run test_debian
+	[ "$status" -eq 0 ]
+	[[ "$output" == *"=/tmp/mount-1/foo.txt:0=0="* ]]
+	[[ "$output" == *"=/tmp/mount-2/foo.txt:1=1="* ]]
+}
+
+@test "same idmap mount (different mappings) [userns]" {
+	setup_idmap_userns
+
+	# Mount the same directory with different mappings. Make sure we also use
+	# different mappings for uids and gids.
+	setup_idmap_single_mount 100:100000:100 200:100000:100 multi1
+	setup_idmap_single_mount 100:101000:100 200:102000:100 multi1 multi1-alt
+
+	update_config '.process.args = ["bash", "-c", "stat -c =%n:%u=%g= /tmp/mount-multi1{,-alt}/{foo,bar,baz}.txt"]'
+
+	runc run test_debian
+	[ "$status" -eq 0 ]
+	[[ "$output" == *"=/tmp/mount-multi1/foo.txt:0=11="* ]]
+	[[ "$output" == *"=/tmp/mount-multi1/bar.txt:1=22="* ]]
+	[[ "$output" == *"=/tmp/mount-multi1/baz.txt:2=33="* ]]
+	[[ "$output" == *"=/tmp/mount-multi1-alt/foo.txt:1000=2011="* ]]
+	[[ "$output" == *"=/tmp/mount-multi1-alt/bar.txt:1001=2022="* ]]
+	[[ "$output" == *"=/tmp/mount-multi1-alt/baz.txt:1002=2033="* ]]
 }
 
-@test "idmap mount without gidMappings fails" {
-	update_config ' .mounts |= map((select(.source == "source-1/") | del(.gidMappings) ) // .)'
+@test "same idmap mount (different mappings) [no userns]" {
+	# Mount the same directory with different mappings. Make sure we also use
+	# different mappings for uids and gids.
+	setup_idmap_single_mount 100:100000:100 200:100000:100 multi1
+	setup_idmap_single_mount 100:101000:100 200:102000:100 multi1 multi1-alt
+
+	update_config '.process.args = ["bash", "-c", "stat -c =%n:%u=%g= /tmp/mount-multi1{,-alt}/{foo,bar,baz}.txt"]'
 
 	runc run test_debian
-	[ "$status" -eq 1 ]
-	[[ "${output}" == *"invalid mount"* ]]
+	[ "$status" -eq 0 ]
+	[[ "$output" == *"=/tmp/mount-multi1/foo.txt:100000=100011="* ]]
+	[[ "$output" == *"=/tmp/mount-multi1/bar.txt:100001=100022="* ]]
+	[[ "$output" == *"=/tmp/mount-multi1/baz.txt:100002=100033="* ]]
+	[[ "$output" == *"=/tmp/mount-multi1-alt/foo.txt:101000=102011="* ]]
+	[[ "$output" == *"=/tmp/mount-multi1-alt/bar.txt:101001=102022="* ]]
+	[[ "$output" == *"=/tmp/mount-multi1-alt/baz.txt:101002=102033="* ]]
 }
 
-@test "idmap mount without uidMappings fails" {
-	update_config ' .mounts |= map((select(.source == "source-1/") | del(.uidMappings) ) // .)'
+@test "multiple idmap mounts (different mappings) [userns]" {
+	setup_idmap_userns
+
+	# Make sure we use different mappings for uids and gids.
+	setup_idmap_single_mount 100:101100:3 200:101900:50 multi1
+	setup_idmap_single_mount 200:102200:3 200:102900:100 multi2
+	setup_idmap_single_mount 5000000:103000:1000 6000000:103000:500 multi3
+
+	update_config '.process.args = ["bash", "-c", "stat -c =%n:%u=%g= /tmp/mount-multi[123]/{foo,bar,baz}.txt"]'
 
 	runc run test_debian
-	[ "$status" -eq 1 ]
-	[[ "${output}" == *"invalid mount"* ]]
+	[ "$status" -eq 0 ]
+	[[ "$output" == *"=/tmp/mount-multi1/foo.txt:1100=1911="* ]]
+	[[ "$output" == *"=/tmp/mount-multi1/bar.txt:1101=1922="* ]]
+	[[ "$output" == *"=/tmp/mount-multi1/baz.txt:1102=1933="* ]]
+	[[ "$output" == *"=/tmp/mount-multi2/foo.txt:2200=2911="* ]]
+	[[ "$output" == *"=/tmp/mount-multi2/bar.txt:2201=2922="* ]]
+	[[ "$output" == *"=/tmp/mount-multi2/baz.txt:2202=2933="* ]]
+	[[ "$output" == *"=/tmp/mount-multi3/foo.txt:3528=3491="* ]]
+	[[ "$output" == *"=/tmp/mount-multi3/bar.txt:3133=3337="* ]]
+	[[ "$output" == *"=/tmp/mount-multi3/baz.txt:3999=3444="* ]]
 }
 
-@test "idmap mount without bind fails" {
-	update_config ' .mounts |= map((select(.source == "source-1/") | .options = [""]) // .)'
+@test "multiple idmap mounts (different mappings) [no userns]" {
+	# Make sure we use different mappings for uids and gids.
+	setup_idmap_single_mount 100:1100:3 200:1900:50 multi1
+	setup_idmap_single_mount 200:2200:3 200:2900:100 multi2
+	setup_idmap_single_mount 5000000:3000:1000 6000000:3000:500 multi3
+
+	update_config '.process.args = ["bash", "-c", "stat -c =%n:%u=%g= /tmp/mount-multi[123]/{foo,bar,baz}.txt"]'
 
 	runc run test_debian
-	[ "$status" -eq 1 ]
-	[[ "${output}" == *"invalid mount"* ]]
+	[ "$status" -eq 0 ]
+	[[ "$output" == *"=/tmp/mount-multi1/foo.txt:1100=1911="* ]]
+	[[ "$output" == *"=/tmp/mount-multi1/bar.txt:1101=1922="* ]]
+	[[ "$output" == *"=/tmp/mount-multi1/baz.txt:1102=1933="* ]]
+	[[ "$output" == *"=/tmp/mount-multi2/foo.txt:2200=2911="* ]]
+	[[ "$output" == *"=/tmp/mount-multi2/bar.txt:2201=2922="* ]]
+	[[ "$output" == *"=/tmp/mount-multi2/baz.txt:2202=2933="* ]]
+	[[ "$output" == *"=/tmp/mount-multi3/foo.txt:3528=3491="* ]]
+	[[ "$output" == *"=/tmp/mount-multi3/bar.txt:3133=3337="* ]]
+	[[ "$output" == *"=/tmp/mount-multi3/baz.txt:3999=3444="* ]]
 }
 
-@test "idmap mount with different mapping than userns fails" {
-	# Let's modify the containerID to be 1, instead of 0 as it is in the
-	# userns config.
-	update_config ' .mounts |= map((select(.source == "source-1/") | .uidMappings[0]["containerID"] = 1 ) // .)'
+@test "idmap mount (complicated mapping) [userns]" {
+	setup_idmap_userns
 
+	update_config '.mounts += [
+			{
+				"source": "source-multi1/",
+				"destination": "/tmp/mount-multi1",
+				"options": ["bind"],
+				"uidMappings": [
+					{"containerID": 100, "hostID": 101000, "size": 1},
+					{"containerID": 101, "hostID": 102000, "size": 1},
+					{"containerID": 102, "hostID": 103000, "size": 1}
+				],
+				"gidMappings": [
+					{"containerID": 210, "hostID": 101100, "size": 10},
+					{"containerID": 220, "hostID": 102200, "size": 10},
+					{"containerID": 230, "hostID": 103300, "size": 10}
+				]
+			}
+		]'
+
+	update_config '.process.args = ["bash", "-c", "stat -c =%n:%u=%g= /tmp/mount-multi1/{foo,bar,baz}.txt"]'
 	runc run test_debian
-	[ "$status" -eq 1 ]
-	[[ "${output}" == *"invalid mount"* ]]
+	[ "$status" -eq 0 ]
+	[[ "$output" == *"=/tmp/mount-multi1/foo.txt:1000=1101="* ]]
+	[[ "$output" == *"=/tmp/mount-multi1/bar.txt:2000=2202="* ]]
+	[[ "$output" == *"=/tmp/mount-multi1/baz.txt:3000=3303="* ]]
+}
+
+@test "idmap mount (complicated mapping) [no userns]" {
+	update_config '.mounts += [
+			{
+				"source": "source-multi1/",
+				"destination": "/tmp/mount-multi1",
+				"options": ["bind"],
+				"uidMappings": [
+					{"containerID": 100, "hostID": 1000, "size": 1},
+					{"containerID": 101, "hostID": 2000, "size": 1},
+					{"containerID": 102, "hostID": 3000, "size": 1}
+				],
+				"gidMappings": [
+					{"containerID": 210, "hostID": 1100, "size": 10},
+					{"containerID": 220, "hostID": 2200, "size": 10},
+					{"containerID": 230, "hostID": 3300, "size": 10}
+				]
+			}
+		]'
+
+	update_config '.process.args = ["bash", "-c", "stat -c =%n:%u=%g= /tmp/mount-multi1/{foo,bar,baz}.txt"]'
+	runc run test_debian
+	[ "$status" -eq 0 ]
+	[[ "$output" == *"=/tmp/mount-multi1/foo.txt:1000=1101="* ]]
+	[[ "$output" == *"=/tmp/mount-multi1/bar.txt:2000=2202="* ]]
+	[[ "$output" == *"=/tmp/mount-multi1/baz.txt:3000=3303="* ]]
 }
diff --git a/tty.go b/tty.go
index fba3e025bc0..c101aacb78b 100644
--- a/tty.go
+++ b/tty.go
@@ -100,7 +100,7 @@ func (t *tty) initHostConsole() error {
 }
 
 func (t *tty) recvtty(socket *os.File) (Err error) {
-	f, err := utils.RecvFd(socket)
+	f, err := utils.RecvFile(socket)
 	if err != nil {
 		return err
 	}