From 7af783a3253b9c3d4c884cf5bfc32b4057ec4ccf Mon Sep 17 00:00:00 2001 From: lifubang Date: Fri, 21 Feb 2025 01:45:21 +0000 Subject: [PATCH 1/5] use golang to implement some functions in nsexec stage-0 As we want to move some code from c to go, we should implement them in golang first, for example: UpdateSetgroups, TryMappingTool, UpdateUidmap, UpdateGidmap, UpdateTimeNsOffsets, and UpdateOomScoreAdj. Signed-off-by: lifubang --- libcontainer/configs/config.go | 5 ++ libcontainer/system/linux.go | 116 +++++++++++++++++++++++++++++++++ 2 files changed, 121 insertions(+) diff --git a/libcontainer/configs/config.go b/libcontainer/configs/config.go index 746c7190eb7..1fca4065f18 100644 --- a/libcontainer/configs/config.go +++ b/libcontainer/configs/config.go @@ -27,6 +27,11 @@ type IDMap struct { Size int64 `json:"size"` } +// ToString is to serialize the IDMap to a string. +func (i IDMap) ToString() string { + return fmt.Sprintf("%d %d %d", i.ContainerID, i.HostID, i.Size) +} + // Seccomp represents syscall restrictions // By default, only the native architecture of the kernel is allowed to be used // for syscalls. Additional architectures can be added by specifying them in diff --git a/libcontainer/system/linux.go b/libcontainer/system/linux.go index e8ce0ecac9d..4c655d1d8e4 100644 --- a/libcontainer/system/linux.go +++ b/libcontainer/system/linux.go @@ -3,15 +3,30 @@ package system import ( + "errors" "fmt" "io" "os" + "os/exec" + "strconv" + "strings" "unsafe" + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runtime-spec/specs-go" "github.com/sirupsen/logrus" "golang.org/x/sys/unix" ) +// SetgroupsPolicy is used for setgroups policies. 
+type SetgroupsPolicy int + +const ( + SetgroupsDefault SetgroupsPolicy = iota + 1 + SetgroupsAllow + SetgroupsDeny +) + type ParentDeathSignal int func (p ParentDeathSignal) Restore() error { @@ -169,3 +184,104 @@ func SetLinuxPersonality(personality int) error { } return nil } + +// UpdateSetgroups is to set the process's setgroups policy +// This *must* be called before we touch gid_map. +func UpdateSetgroups(pid int, policy SetgroupsPolicy) error { + var strPolicy string + switch policy { + case SetgroupsAllow: + strPolicy = "allow" + case SetgroupsDeny: + strPolicy = "deny" + case SetgroupsDefault: + fallthrough + default: + return nil + } + err := os.WriteFile("/proc/"+strconv.Itoa(pid)+"/setgroups", []byte(strPolicy), 0) + + // If the kernel is too old to support /proc/pid/setgroups, + // open(2) or write(2) will return ENOENT. This is fine. + if errors.Is(err, unix.ENOENT) { + return nil + } + return err +} + +// TryMappingTool is to try to use the mapping tool to map the uid/gid. +func TryMappingTool(app string, pid int, idMappings []configs.IDMap) error { + if app == "" { + return fmt.Errorf("no mapping tool specified") + } + + argv := []string{strconv.Itoa(pid)} + for _, m := range idMappings { + argv = append(argv, strconv.FormatInt(m.ContainerID, 10), strconv.FormatInt(m.HostID, 10), strconv.FormatInt(m.Size, 10)) + } + cmd := exec.Command(app, argv...) + if err := cmd.Start(); err != nil { + return err + } + return cmd.Wait() +} + +// UpdateUidmap is to update the uid map of the process. 
+func UpdateUidmap(app string, pid int, uidMappings []configs.IDMap) error { + if len(uidMappings) == 0 { + return nil + } + + data := []string{} + for _, m := range uidMappings { + data = append(data, m.ToString()) + } + logrus.Debugf("update /proc/%d/uid_map to '%s'", pid, strings.Join(data, "\n")) + err := os.WriteFile("/proc/"+strconv.Itoa(pid)+"/uid_map", []byte(strings.Join(data, "\n")), 0) + if errors.Is(err, unix.EPERM) { + logrus.Debugf("update /proc/%d/uid_map got -EPERM (trying %s)", pid, app) + return TryMappingTool(app, pid, uidMappings) + } + return err +} + +// UpdateGidmap is to update the gid map of the process. +func UpdateGidmap(app string, pid int, gidMappings []configs.IDMap) error { + if len(gidMappings) == 0 { + return nil + } + + data := []string{} + for _, m := range gidMappings { + data = append(data, m.ToString()) + } + logrus.Debugf("update /proc/%d/gid_map to '%s'", pid, strings.Join(data, "\n")) + err := os.WriteFile("/proc/"+strconv.Itoa(pid)+"/gid_map", []byte(strings.Join(data, "\n")), 0) + if errors.Is(err, unix.EPERM) { + logrus.Debugf("update /proc/%d/gid_map got -EPERM (trying %s)", pid, app) + return TryMappingTool(app, pid, gidMappings) + } + return err +} + +// UpdateTimeNsOffsets is to update the time namespace offsets of the process. +func UpdateTimeNsOffsets(pid int, offsets map[string]specs.LinuxTimeOffset) error { + if len(offsets) == 0 { + return nil + } + var data []string + for clock, offset := range offsets { + data = append(data, clock+" "+strconv.FormatInt(offset.Secs, 10)+" "+strconv.FormatInt(int64(offset.Nanosecs), 10)) + } + logrus.Debugf("update /proc/%d/timens_offsets to '%s'", pid, strings.Join(data, "\n")) + return os.WriteFile("/proc/"+strconv.Itoa(pid)+"/timens_offsets", []byte(strings.Join(data, "\n")), 0) +} + +// UpdateOomScoreAdj is to update oom_score_adj of the process. 
+func UpdateOomScoreAdj(oomScoreAdj string) error { + if len(oomScoreAdj) == 0 { + return nil + } + logrus.Debugf("update /proc/self/oom_score_adj to '%s'", oomScoreAdj) + return os.WriteFile("/proc/self/oom_score_adj", []byte(oomScoreAdj), 0) +} From d7c10a9c3adac144c47b72ddd65b88d96dd0dd82 Mon Sep 17 00:00:00 2001 From: lifubang Date: Fri, 21 Feb 2025 02:13:20 +0000 Subject: [PATCH 2/5] use exact param type for function requiresRootOrMappingTool Signed-off-by: lifubang --- libcontainer/container_linux.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go index 54a0eaafe06..642e0d482ab 100644 --- a/libcontainer/container_linux.go +++ b/libcontainer/container_linux.go @@ -1123,7 +1123,7 @@ func (c *Container) bootstrapData(cloneFlags uintptr, nsMaps map[configs.Namespa }) } } - if requiresRootOrMappingTool(c.config) { + if requiresRootOrMappingTool(c.config.GIDMappings) { r.AddData(&Boolmsg{ Type: SetgroupAttr, Value: true, @@ -1186,9 +1186,9 @@ func ignoreTerminateErrors(err error) error { return err } -func requiresRootOrMappingTool(c *configs.Config) bool { +func requiresRootOrMappingTool(gidMappings []configs.IDMap) bool { gidMap := []configs.IDMap{ {ContainerID: 0, HostID: int64(os.Getegid()), Size: 1}, } - return !reflect.DeepEqual(c.GIDMappings, gidMap) + return !reflect.DeepEqual(gidMappings, gidMap) } From 36e2a76eac9cef6d02c1edbeb702ac14f4af8e1d Mon Sep 17 00:00:00 2001 From: lifubang Date: Fri, 21 Feb 2025 02:23:10 +0000 Subject: [PATCH 3/5] implement nsexec stage-0 with golang Signed-off-by: lifubang --- libcontainer/container_setup_linux.go | 147 ++++++++++++++++++++++++++ libcontainer/process_linux.go | 1 + 2 files changed, 148 insertions(+) create mode 100644 libcontainer/container_setup_linux.go diff --git a/libcontainer/container_setup_linux.go b/libcontainer/container_setup_linux.go new file mode 100644 index 00000000000..5e1e37a3f80 --- /dev/null +++ 
b/libcontainer/container_setup_linux.go @@ -0,0 +1,147 @@ +package libcontainer + +import ( + "encoding/binary" + "fmt" + "io" + "os" + "os/exec" + + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/system" + "github.com/sirupsen/logrus" + "github.com/vishvananda/netlink/nl" + "golang.org/x/sys/unix" +) + +// NsExecSyncMsg is used for communication between the parent and child during +// container setup. +type NsExecSyncMsg uint32 + +const ( + SyncUsermapPls NsExecSyncMsg = iota + 0x40 + SyncUsermapAck + SyncRecvPidPls + SyncRecvPidAck + SyncTimeOffsetsPls + SyncTimeOffsetsAck +) + +const bufSize = 4 + +// setupNsExec is used to help nsexec to setup the container and wait the container's pid. +func (s *containerProcess) setupNsExec(syncSock *os.File) error { + logrus.Debugf("waiting nsexec to report the container's pid") + err := ParseNsExecSync(syncSock, func(msg NsExecSyncMsg) error { + switch msg { + case SyncUsermapPls: + logrus.Debugf("nsexec has requested userns mappings") + if err := s.setupUsermap(); err != nil { + return err + } + return AckNsExecSync(syncSock, SyncUsermapAck) + case SyncTimeOffsetsPls: + logrus.Debugf("nsexec has requested to configure timens offsets") + if err := system.UpdateTimeNsOffsets(s.cmd.Process.Pid, s.container.config.TimeOffsets); err != nil { + return err + } + return AckNsExecSync(syncSock, SyncTimeOffsetsAck) + case SyncRecvPidPls: + logrus.Debugf("nsexec has reported pid") + var pid uint32 + if err := binary.Read(syncSock, nl.NativeEndian(), &pid); err != nil { + return err + } + s.childPid = int(pid) + return AckNsExecSync(syncSock, SyncRecvPidAck) + default: + return fmt.Errorf("unexpected message %d", msg) + } + }) + + return err +} + +// setupUsermap is used to set up the user mappings. +func (s *containerProcess) setupUsermap() error { + var uidMapPath, gidMapPath string + + // Enable setgroups(2) if we've been asked to. 
But we also have to explicitly + // disable setgroups(2) if we're creating a rootless container for single-entry + // mapping. (this is required since Linux 3.19). + // For rootless multi-entry mapping, we should use newuidmap/newgidmap + // to do mapping user namespace. + if s.config.Config.RootlessEUID && !requiresRootOrMappingTool(s.config.Config.GIDMappings) { + _ = system.UpdateSetgroups(s.cmd.Process.Pid, system.SetgroupsDeny) + } + + nsMaps := make(map[configs.NamespaceType]string) + for _, ns := range s.container.config.Namespaces { + if ns.Path != "" { + nsMaps[ns.Type] = ns.Path + } + } + _, joinExistingUser := nsMaps[configs.NEWUSER] + if !joinExistingUser { + // write uid mappings + if len(s.container.config.UIDMappings) > 0 { + if s.container.config.RootlessEUID { + if path, err := exec.LookPath("newuidmap"); err == nil { + uidMapPath = path + } + } + } + + // write gid mappings + if len(s.container.config.GIDMappings) > 0 { + if s.container.config.RootlessEUID { + if path, err := exec.LookPath("newgidmap"); err == nil { + gidMapPath = path + } + } + } + } + + /* Set up mappings. */ + if err := system.UpdateUidmap(uidMapPath, s.cmd.Process.Pid, s.container.config.UIDMappings); err != nil { + return err + } + return system.UpdateGidmap(gidMapPath, s.cmd.Process.Pid, s.container.config.GIDMappings) +} + +// ParseNsExecSync runs the given callback function on each message received +// from the child. It will return once the child sends SYNC_RECVPID_PLS. +func ParseNsExecSync(r io.Reader, fn func(NsExecSyncMsg) error) error { + var ( + msg NsExecSyncMsg + buf [bufSize]byte + ) + + native := nl.NativeEndian() + + for { + if _, err := io.ReadAtLeast(r, buf[:], bufSize); err != nil { + return err + } + msg = NsExecSyncMsg(native.Uint32(buf[:])) + if err := fn(msg); err != nil { + return err + } + if msg == SyncRecvPidPls { + break + } + } + return nil +} + +// AckNsExecSync is used to send a message to the child. 
+func AckNsExecSync(f *os.File, msg NsExecSyncMsg) error { + var buf [bufSize]byte + native := nl.NativeEndian() + native.PutUint32(buf[:], uint32(msg)) + if _, err := unix.Write(int(f.Fd()), buf[:]); err != nil { + logrus.Debugf("failed to write message to nsexec: %v", err) + return err + } + return nil +} diff --git a/libcontainer/process_linux.go b/libcontainer/process_linux.go index 68a5fd7bcd4..ed421bc26a2 100644 --- a/libcontainer/process_linux.go +++ b/libcontainer/process_linux.go @@ -104,6 +104,7 @@ type containerProcess struct { process *Process bootstrapData io.Reader container *Container + childPid int } func (p *containerProcess) pid() int { From 827233276b1730bb1c708c434a40f1879e3c9a8b Mon Sep 17 00:00:00 2001 From: lifubang Date: Fri, 21 Feb 2025 02:44:20 +0000 Subject: [PATCH 4/5] replace nsexec stage-0 c code with go implementation Signed-off-by: lifubang --- libcontainer/container_linux.go | 74 +---- libcontainer/init_linux.go | 5 - libcontainer/message_linux.go | 16 +- libcontainer/nsenter/log.c | 2 +- libcontainer/nsenter/nsenter_test.go | 88 ++++-- libcontainer/nsenter/nsexec.c | 411 ++------------------------- libcontainer/process_linux.go | 65 ++--- 7 files changed, 120 insertions(+), 541 deletions(-) diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go index 642e0d482ab..4df57747fba 100644 --- a/libcontainer/container_linux.go +++ b/libcontainer/container_linux.go @@ -536,6 +536,10 @@ func (c *Container) newParentProcess(p *Process) (parentProcess, error) { cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITPIPE="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1), ) + cmd.ExtraFiles = append(cmd.ExtraFiles, comm.stage1SockChild) + cmd.Env = append(cmd.Env, + "_LIBCONTAINER_STAGE1PIPE="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1), + ) cmd.ExtraFiles = append(cmd.ExtraFiles, comm.syncSockChild.File()) cmd.Env = append(cmd.Env, "_LIBCONTAINER_SYNCPIPE="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1), @@ -1022,17 +1026,6 
@@ func (c *Container) orderNamespacePaths(namespaces map[configs.NamespaceType]str return paths, nil } -func encodeIDMapping(idMap []configs.IDMap) ([]byte, error) { - data := bytes.NewBuffer(nil) - for _, im := range idMap { - line := fmt.Sprintf("%d %d %d\n", im.ContainerID, im.HostID, im.Size) - if _, err := data.WriteString(line); err != nil { - return nil, err - } - } - return data.Bytes(), nil -} - // netlinkError is an error wrapper type for use by custom netlink message // types. Panics with errors are wrapped in netlinkError so that the recover // in bootstrapData can distinguish intentional panics. @@ -1079,59 +1072,6 @@ func (c *Container) bootstrapData(cloneFlags uintptr, nsMaps map[configs.Namespa }) } - // write namespace paths only when we are not joining an existing user ns - _, joinExistingUser := nsMaps[configs.NEWUSER] - if !joinExistingUser { - // write uid mappings - if len(c.config.UIDMappings) > 0 { - if c.config.RootlessEUID { - // We resolve the paths for new{u,g}idmap from - // the context of runc to avoid doing a path - // lookup in the nsexec context. 
- if path, err := exec.LookPath("newuidmap"); err == nil { - r.AddData(&Bytemsg{ - Type: UidmapPathAttr, - Value: []byte(path), - }) - } - } - b, err := encodeIDMapping(c.config.UIDMappings) - if err != nil { - return nil, err - } - r.AddData(&Bytemsg{ - Type: UidmapAttr, - Value: b, - }) - } - - // write gid mappings - if len(c.config.GIDMappings) > 0 { - b, err := encodeIDMapping(c.config.GIDMappings) - if err != nil { - return nil, err - } - r.AddData(&Bytemsg{ - Type: GidmapAttr, - Value: b, - }) - if c.config.RootlessEUID { - if path, err := exec.LookPath("newgidmap"); err == nil { - r.AddData(&Bytemsg{ - Type: GidmapPathAttr, - Value: []byte(path), - }) - } - } - if requiresRootOrMappingTool(c.config.GIDMappings) { - r.AddData(&Boolmsg{ - Type: SetgroupAttr, - Value: true, - }) - } - } - } - if c.config.OomScoreAdj != nil { // write oom_score_adj r.AddData(&Bytemsg{ @@ -1140,12 +1080,6 @@ func (c *Container) bootstrapData(cloneFlags uintptr, nsMaps map[configs.Namespa }) } - // write rootless - r.AddData(&Boolmsg{ - Type: RootlessEUIDAttr, - Value: c.config.RootlessEUID, - }) - // write boottime and monotonic time ns offsets. if c.config.TimeOffsets != nil { var offsetSpec bytes.Buffer diff --git a/libcontainer/init_linux.go b/libcontainer/init_linux.go index f78e561755f..79b40ce7204 100644 --- a/libcontainer/init_linux.go +++ b/libcontainer/init_linux.go @@ -33,11 +33,6 @@ const ( initStandard initType = "standard" ) -type pid struct { - Pid int `json:"stage2_pid"` - PidFirstChild int `json:"stage1_pid"` -} - // network is an internal struct used to setup container networks. 
type network struct { configs.Network diff --git a/libcontainer/message_linux.go b/libcontainer/message_linux.go index 2790f018d06..995a201f853 100644 --- a/libcontainer/message_linux.go +++ b/libcontainer/message_linux.go @@ -11,17 +11,11 @@ import ( // list of known message types we want to send to bootstrap program // The number is randomly chosen to not conflict with known netlink types const ( - InitMsg uint16 = 62000 - CloneFlagsAttr uint16 = 27281 - NsPathsAttr uint16 = 27282 - UidmapAttr uint16 = 27283 - GidmapAttr uint16 = 27284 - SetgroupAttr uint16 = 27285 - OomScoreAdjAttr uint16 = 27286 - RootlessEUIDAttr uint16 = 27287 - UidmapPathAttr uint16 = 27288 - GidmapPathAttr uint16 = 27289 - TimeOffsetsAttr uint16 = 27290 + InitMsg uint16 = 62000 + CloneFlagsAttr uint16 = 27281 + NsPathsAttr uint16 = 27282 + OomScoreAdjAttr uint16 = 27286 + TimeOffsetsAttr uint16 = 27290 ) type Int32msg struct { diff --git a/libcontainer/nsenter/log.c b/libcontainer/nsenter/log.c index 086b539833c..39f1cc1ae3b 100644 --- a/libcontainer/nsenter/log.c +++ b/libcontainer/nsenter/log.c @@ -58,7 +58,7 @@ void write_log(int level, const char *format, ...) 
if (stage == NULL) goto out; } else { - ret = asprintf(&stage, "nsexec-%d", current_stage); + ret = asprintf(&stage, "nsexec-%d", current_stage + 1); if (ret < 0) { stage = NULL; goto out; diff --git a/libcontainer/nsenter/nsenter_test.go b/libcontainer/nsenter/nsenter_test.go index c0b4e9b47e4..09a892eb79a 100644 --- a/libcontainer/nsenter/nsenter_test.go +++ b/libcontainer/nsenter/nsenter_test.go @@ -2,6 +2,7 @@ package nsenter import ( "bytes" + "encoding/binary" "encoding/json" "errors" "fmt" @@ -16,9 +17,16 @@ import ( "golang.org/x/sys/unix" ) +type mockProcessParent struct { + childPid int +} + func TestNsenterValidPaths(t *testing.T) { args := []string{"nsenter-exec"} parent, child := newPipe(t) + syncParent, syncChild := newPipe(t) + + process, chErr := startMockProcessParent(syncParent) namespaces := []string{ // join pid ns of the current process @@ -27,8 +35,8 @@ func TestNsenterValidPaths(t *testing.T) { cmd := &exec.Cmd{ Path: os.Args[0], Args: args, - ExtraFiles: []*os.File{child}, - Env: []string{"_LIBCONTAINER_INITPIPE=3"}, + ExtraFiles: []*os.File{child, syncChild}, + Env: []string{"_LIBCONTAINER_INITPIPE=3", "_LIBCONTAINER_STAGE1PIPE=4"}, Stdout: os.Stdout, Stderr: os.Stderr, } @@ -56,12 +64,19 @@ func TestNsenterValidPaths(t *testing.T) { t.Fatalf("nsenter error: %v", err) } - reapChildren(t, parent) + if err := <-chErr; err != nil { + t.Fatal(err) + } + + reapChildren(t, process) } func TestNsenterInvalidPaths(t *testing.T) { args := []string{"nsenter-exec"} parent, child := newPipe(t) + syncParent, syncChild := newPipe(t) + + _, _ = startMockProcessParent(syncParent) namespaces := []string{ fmt.Sprintf("pid:/proc/%d/ns/pid", -1), @@ -69,8 +84,8 @@ func TestNsenterInvalidPaths(t *testing.T) { cmd := &exec.Cmd{ Path: os.Args[0], Args: args, - ExtraFiles: []*os.File{child}, - Env: []string{"_LIBCONTAINER_INITPIPE=3"}, + ExtraFiles: []*os.File{child, syncChild}, + Env: []string{"_LIBCONTAINER_INITPIPE=3", "_LIBCONTAINER_STAGE1PIPE=4"}, } if err := 
cmd.Start(); err != nil { @@ -100,6 +115,9 @@ func TestNsenterInvalidPaths(t *testing.T) { func TestNsenterIncorrectPathType(t *testing.T) { args := []string{"nsenter-exec"} parent, child := newPipe(t) + syncParent, syncChild := newPipe(t) + + _, _ = startMockProcessParent(syncParent) namespaces := []string{ fmt.Sprintf("net:/proc/%d/ns/pid", os.Getpid()), @@ -107,8 +125,8 @@ func TestNsenterIncorrectPathType(t *testing.T) { cmd := &exec.Cmd{ Path: os.Args[0], Args: args, - ExtraFiles: []*os.File{child}, - Env: []string{"_LIBCONTAINER_INITPIPE=3"}, + ExtraFiles: []*os.File{child, syncChild}, + Env: []string{"_LIBCONTAINER_INITPIPE=3", "_LIBCONTAINER_STAGE1PIPE=4"}, } if err := cmd.Start(); err != nil { @@ -139,6 +157,9 @@ func TestNsenterChildLogging(t *testing.T) { args := []string{"nsenter-exec"} parent, child := newPipe(t) logread, logwrite := newPipe(t) + syncParent, syncChild := newPipe(t) + + process, chErr := startMockProcessParent(syncParent) namespaces := []string{ // join pid ns of the current process @@ -147,8 +168,8 @@ func TestNsenterChildLogging(t *testing.T) { cmd := &exec.Cmd{ Path: os.Args[0], Args: args, - ExtraFiles: []*os.File{child, logwrite}, - Env: []string{"_LIBCONTAINER_INITPIPE=3", "_LIBCONTAINER_LOGPIPE=4"}, + ExtraFiles: []*os.File{child, syncChild, logwrite}, + Env: []string{"_LIBCONTAINER_INITPIPE=3", "_LIBCONTAINER_STAGE1PIPE=4", "_LIBCONTAINER_LOGPIPE=5"}, Stdout: os.Stdout, Stderr: os.Stderr, } @@ -178,7 +199,11 @@ func TestNsenterChildLogging(t *testing.T) { t.Fatalf("nsenter error: %v", err) } - reapChildren(t, parent) + if err := <-chErr; err != nil { + t.Fatal(err) + } + + reapChildren(t, process) } func init() { @@ -202,26 +227,37 @@ func newPipe(t *testing.T) (parent *os.File, child *os.File) { return } -func reapChildren(t *testing.T, parent *os.File) { - t.Helper() - decoder := json.NewDecoder(parent) - decoder.DisallowUnknownFields() - var pid struct { - Pid2 int `json:"stage2_pid"` - Pid1 int `json:"stage1_pid"` - } - if 
err := decoder.Decode(&pid); err != nil { - t.Fatal(err) - } +func startMockProcessParent(syncSock *os.File) (*mockProcessParent, chan error) { + process := &mockProcessParent{} + ch := make(chan error, 1) - // Reap children. - _, _ = unix.Wait4(pid.Pid1, nil, 0, nil) - _, _ = unix.Wait4(pid.Pid2, nil, 0, nil) + go (func() { + ch <- libcontainer.ParseNsExecSync(syncSock, func(msg libcontainer.NsExecSyncMsg) error { + if msg == libcontainer.SyncRecvPidPls { + var pid uint32 + if err := binary.Read(syncSock, nl.NativeEndian(), &pid); err != nil { + return err + } + process.childPid = int(pid) + return libcontainer.AckNsExecSync(syncSock, libcontainer.SyncRecvPidAck) + } + return nil + }) + })() + + return process, ch +} + +func reapChildren(t *testing.T, parent *mockProcessParent) { + t.Helper() // Sanity check. - if pid.Pid1 == 0 || pid.Pid2 == 0 { - t.Fatal("got pids:", pid) + if parent.childPid <= 0 { + t.Fatal("got pid:", parent.childPid) } + + // Reap children. + _, _ = unix.Wait4(parent.childPid, nil, 0, nil) } func getLogs(t *testing.T, logread *os.File) { diff --git a/libcontainer/nsenter/nsexec.c b/libcontainer/nsenter/nsexec.c index 607d495a263..af25e2aed48 100644 --- a/libcontainer/nsenter/nsexec.c +++ b/libcontainer/nsenter/nsexec.c @@ -37,17 +37,14 @@ enum sync_t { SYNC_USERMAP_ACK = 0x41, /* Mapping finished by the parent. */ SYNC_RECVPID_PLS = 0x42, /* Tell parent we're sending the PID. */ SYNC_RECVPID_ACK = 0x43, /* PID was correctly received by parent. */ - SYNC_GRANDCHILD = 0x44, /* The grandchild is ready to run. */ - SYNC_CHILD_FINISH = 0x45, /* The child or grandchild has finished. */ - SYNC_TIMEOFFSETS_PLS = 0x46, /* Request parent to write timens offsets. */ - SYNC_TIMEOFFSETS_ACK = 0x47, /* Timens offsets were written. */ + SYNC_TIMEOFFSETS_PLS = 0x44, /* Request parent to write timens offsets. */ + SYNC_TIMEOFFSETS_ACK = 0x45, /* Timens offsets were written. */ }; #define STAGE_SETUP -1 /* longjmp() arguments. 
*/ -#define STAGE_PARENT 0 -#define STAGE_CHILD 1 -#define STAGE_INIT 2 +#define STAGE_CHILD 0 +#define STAGE_INIT 1 /* Stores the current stage of nsexec. */ int current_stage = STAGE_SETUP; @@ -102,13 +99,7 @@ struct nlconfig_t { #define INIT_MSG 62000 #define CLONE_FLAGS_ATTR 27281 #define NS_PATHS_ATTR 27282 -#define UIDMAP_ATTR 27283 -#define GIDMAP_ATTR 27284 -#define SETGROUP_ATTR 27285 #define OOM_SCORE_ADJ_ATTR 27286 -#define ROOTLESS_EUID_ATTR 27287 -#define UIDMAPPATH_ATTR 27288 -#define GIDMAPPATH_ATTR 27289 #define TIMENSOFFSET_ATTR 27290 /* @@ -132,9 +123,6 @@ int setns(int fd, int nstype) } #endif -/* XXX: This is ugly. */ -static int syncfd = -1; - static int write_file(char *data, size_t data_len, char *pathfmt, ...) { int fd, len, ret = 0; @@ -163,136 +151,6 @@ static int write_file(char *data, size_t data_len, char *pathfmt, ...) return ret; } -enum policy_t { - SETGROUPS_DEFAULT = 0, - SETGROUPS_ALLOW, - SETGROUPS_DENY, -}; - -/* This *must* be called before we touch gid_map. */ -static void update_setgroups(int pid, enum policy_t setgroup) -{ - char *policy; - - switch (setgroup) { - case SETGROUPS_ALLOW: - policy = "allow"; - break; - case SETGROUPS_DENY: - policy = "deny"; - break; - case SETGROUPS_DEFAULT: - default: - /* Nothing to do. */ - return; - } - - if (write_file(policy, strlen(policy), "/proc/%d/setgroups", pid) < 0) { - /* - * If the kernel is too old to support /proc/pid/setgroups, - * open(2) or write(2) will return ENOENT. This is fine. - */ - if (errno != ENOENT) - bail("failed to write '%s' to /proc/%d/setgroups", policy, pid); - } -} - -static int try_mapping_tool(const char *app, int pid, char *map, size_t map_len) -{ - int child; - - /* - * If @app is NULL, execve will segfault. Just check it here and bail (if - * we're in this path, the caller is already getting desperate and there - * isn't a backup to this failing). This usually would be a configuration - * or programming issue. 
- */ - if (!app) - bail("mapping tool not present"); - - child = fork(); - if (child < 0) - bail("failed to fork"); - - if (!child) { -#define MAX_ARGV 20 - char *argv[MAX_ARGV]; - char *envp[] = { NULL }; - char pid_fmt[16]; - int argc = 0; - char *next; - - snprintf(pid_fmt, 16, "%d", pid); - - argv[argc++] = (char *)app; - argv[argc++] = pid_fmt; - /* - * Convert the map string into a list of argument that - * newuidmap/newgidmap can understand. - */ - - while (argc < MAX_ARGV) { - if (*map == '\0') { - argv[argc++] = NULL; - break; - } - argv[argc++] = map; - next = strpbrk(map, "\n "); - if (next == NULL) - break; - *next++ = '\0'; - map = next + strspn(next, "\n "); - } - - execve(app, argv, envp); - bail("failed to execv"); - } else { - int status; - - while (true) { - if (waitpid(child, &status, 0) < 0) { - if (errno == EINTR) - continue; - bail("failed to waitpid"); - } - if (WIFEXITED(status) || WIFSIGNALED(status)) - return WEXITSTATUS(status); - } - } - - return -1; -} - -static void update_uidmap(const char *path, int pid, char *map, size_t map_len) -{ - if (map == NULL || map_len == 0) - return; - - write_log(DEBUG, "update /proc/%d/uid_map to '%s'", pid, map); - if (write_file(map, map_len, "/proc/%d/uid_map", pid) < 0) { - if (errno != EPERM) - bail("failed to update /proc/%d/uid_map", pid); - write_log(DEBUG, "update /proc/%d/uid_map got -EPERM (trying %s)", pid, path); - if (try_mapping_tool(path, pid, map, map_len)) - bail("failed to use newuid map on %d", pid); - } -} - -static void update_gidmap(const char *path, int pid, char *map, size_t map_len) -{ - if (map == NULL || map_len == 0) - return; - - write_log(DEBUG, "update /proc/%d/gid_map to '%s'", pid, map); - if (write_file(map, map_len, "/proc/%d/gid_map", pid) < 0) { - if (errno != EPERM) - bail("failed to update /proc/%d/gid_map", pid); - write_log(DEBUG, "update /proc/%d/gid_map got -EPERM (trying %s)", pid, path); - if (try_mapping_tool(path, pid, map, map_len)) - bail("failed to use 
newgid map on %d", pid); - } -} - static void update_oom_score_adj(char *data, size_t len) { if (data == NULL || len == 0) @@ -327,11 +185,6 @@ static uint32_t readint32(char *buf) return *(uint32_t *) buf; } -static uint8_t readint8(char *buf) -{ - return *(uint8_t *) buf; -} - static void nl_parse(int fd, struct nlconfig_t *config) { size_t len, size; @@ -373,9 +226,6 @@ static void nl_parse(int fd, struct nlconfig_t *config) case CLONE_FLAGS_ATTR: config->cloneflags = readint32(current); break; - case ROOTLESS_EUID_ATTR: - config->is_rootless_euid = readint8(current); /* boolean */ - break; case OOM_SCORE_ADJ_ATTR: config->oom_score_adj = current; config->oom_score_adj_len = payload_len; @@ -384,25 +234,6 @@ static void nl_parse(int fd, struct nlconfig_t *config) config->namespaces = current; config->namespaces_len = payload_len; break; - case UIDMAP_ATTR: - config->uidmap = current; - config->uidmap_len = payload_len; - break; - case GIDMAP_ATTR: - config->gidmap = current; - config->gidmap_len = payload_len; - break; - case UIDMAPPATH_ATTR: - config->uidmappath = current; - config->uidmappath_len = payload_len; - break; - case GIDMAPPATH_ATTR: - config->gidmappath = current; - config->gidmappath_len = payload_len; - break; - case SETGROUP_ATTR: - config->is_setgroup = readint8(current); - break; case TIMENSOFFSET_ATTR: config->timensoffset = current; config->timensoffset_len = payload_len; @@ -664,20 +495,10 @@ void try_unshare(int flags, const char *msg) bail("failed to unshare %s", msg); } -static void update_timens_offsets(pid_t pid, char *map, size_t map_len) -{ - if (map == NULL || map_len == 0) - return; - write_log(DEBUG, "update /proc/%d/timens_offsets to '%s'", pid, map); - if (write_file(map, map_len, "/proc/%d/timens_offsets", pid) < 0) - bail("failed to update /proc/%d/timens_offsets", pid); -} - void nsexec(void) { - int pipenum; + int pipenum, syncfd; jmp_buf env; - int sync_child_pipe[2], sync_grandchild_pipe[2]; struct nlconfig_t config = { 0 
}; /* @@ -697,6 +518,17 @@ void nsexec(void) return; } + /* + * Get the stage1 pipe fd from the environment. The stage1 pipe is used to + * request the parent to do some operations that can't be done in the + * child process. + */ + syncfd = getenv_int("_LIBCONTAINER_STAGE1PIPE"); + if (syncfd < 0) { + /* We are not a runc init. Just return to go runtime. */ + return; + } + write_log(DEBUG, "=> nsexec container setup"); /* Parse all of the netlink configuration. */ @@ -726,17 +558,6 @@ void nsexec(void) bail("failed to set process as non-dumpable"); } - /* Pipe so we can tell the child when we've finished setting up. */ - if (socketpair(AF_LOCAL, SOCK_STREAM, 0, sync_child_pipe) < 0) - bail("failed to setup sync pipe between parent and child"); - - /* - * We need a new socketpair to sync with grandchild so we don't have - * race condition with child. - */ - if (socketpair(AF_LOCAL, SOCK_STREAM, 0, sync_grandchild_pipe) < 0) - bail("failed to setup sync pipe between parent and grandchild"); - /* TODO: Currently we aren't dealing with child deaths properly. */ /* @@ -787,162 +608,6 @@ void nsexec(void) */ switch (setjmp(env)) { - /* - * Stage 0: We're in the parent. Our job is just to create a new child - * (stage 1: STAGE_CHILD) process and write its uid_map and - * gid_map. That process will go on to create a new process, then - * it will send us its PID which we will send to the bootstrap - * process. - */ - case STAGE_PARENT:{ - int len; - pid_t stage1_pid = -1, stage2_pid = -1; - bool stage1_complete, stage2_complete; - - /* For debugging. */ - current_stage = STAGE_PARENT; - prctl(PR_SET_NAME, (unsigned long)"runc:[0:PARENT]", 0, 0, 0); - write_log(DEBUG, "~> nsexec stage-0"); - - /* Start the process of getting a container. 
*/ - write_log(DEBUG, "spawn stage-1"); - stage1_pid = clone_parent(&env, STAGE_CHILD); - if (stage1_pid < 0) - bail("unable to spawn stage-1"); - - syncfd = sync_child_pipe[1]; - if (close(sync_child_pipe[0]) < 0) - bail("failed to close sync_child_pipe[0] fd"); - - /* - * State machine for synchronisation with the children. We only - * return once both the child and grandchild are ready. - */ - write_log(DEBUG, "-> stage-1 synchronisation loop"); - stage1_complete = false; - while (!stage1_complete) { - enum sync_t s; - - if (read(syncfd, &s, sizeof(s)) != sizeof(s)) - bail("failed to sync with stage-1: next state"); - - switch (s) { - case SYNC_USERMAP_PLS: - write_log(DEBUG, "stage-1 requested userns mappings"); - - /* - * Enable setgroups(2) if we've been asked to. But we also - * have to explicitly disable setgroups(2) if we're - * creating a rootless container for single-entry mapping. - * i.e. config.is_setgroup == false. - * (this is required since Linux 3.19). - * - * For rootless multi-entry mapping, config.is_setgroup shall be true and - * newuidmap/newgidmap shall be used. - */ - if (config.is_rootless_euid && !config.is_setgroup) - update_setgroups(stage1_pid, SETGROUPS_DENY); - - /* Set up mappings. */ - update_uidmap(config.uidmappath, stage1_pid, config.uidmap, config.uidmap_len); - update_gidmap(config.gidmappath, stage1_pid, config.gidmap, config.gidmap_len); - - s = SYNC_USERMAP_ACK; - if (write(syncfd, &s, sizeof(s)) != sizeof(s)) { - sane_kill(stage1_pid, SIGKILL); - sane_kill(stage2_pid, SIGKILL); - bail("failed to sync with stage-1: write(SYNC_USERMAP_ACK)"); - } - break; - case SYNC_RECVPID_PLS: - write_log(DEBUG, "stage-1 requested pid to be forwarded"); - - /* Get the stage-2 pid. */ - if (read(syncfd, &stage2_pid, sizeof(stage2_pid)) != sizeof(stage2_pid)) { - sane_kill(stage1_pid, SIGKILL); - bail("failed to sync with stage-1: read(stage2_pid)"); - } - - /* Send ACK. 
*/ - s = SYNC_RECVPID_ACK; - if (write(syncfd, &s, sizeof(s)) != sizeof(s)) { - sane_kill(stage1_pid, SIGKILL); - sane_kill(stage2_pid, SIGKILL); - bail("failed to sync with stage-1: write(SYNC_RECVPID_ACK)"); - } - - /* - * Send both the stage-1 and stage-2 pids back to runc. - * runc needs the stage-2 to continue process management, - * but because stage-1 was spawned with CLONE_PARENT we - * cannot reap it within stage-0 and thus we need to ask - * runc to reap the zombie for us. - */ - write_log(DEBUG, "forward stage-1 (%d) and stage-2 (%d) pids to runc", - stage1_pid, stage2_pid); - len = - dprintf(pipenum, "{\"stage1_pid\":%d,\"stage2_pid\":%d}\n", stage1_pid, - stage2_pid); - if (len < 0) { - sane_kill(stage1_pid, SIGKILL); - sane_kill(stage2_pid, SIGKILL); - bail("failed to sync with runc: write(pid-JSON)"); - } - break; - case SYNC_TIMEOFFSETS_PLS: - write_log(DEBUG, "stage-1 requested timens offsets to be configured"); - update_timens_offsets(stage1_pid, config.timensoffset, config.timensoffset_len); - s = SYNC_TIMEOFFSETS_ACK; - if (write(syncfd, &s, sizeof(s)) != sizeof(s)) { - sane_kill(stage1_pid, SIGKILL); - bail("failed to sync with child: write(SYNC_TIMEOFFSETS_ACK)"); - } - break; - case SYNC_CHILD_FINISH: - write_log(DEBUG, "stage-1 complete"); - stage1_complete = true; - break; - default: - bail("unexpected sync value: %u", s); - } - } - write_log(DEBUG, "<- stage-1 synchronisation loop"); - - /* Now sync with grandchild. 
*/ - syncfd = sync_grandchild_pipe[1]; - if (close(sync_grandchild_pipe[0]) < 0) - bail("failed to close sync_grandchild_pipe[0] fd"); - - write_log(DEBUG, "-> stage-2 synchronisation loop"); - stage2_complete = false; - while (!stage2_complete) { - enum sync_t s; - - write_log(DEBUG, "signalling stage-2 to run"); - s = SYNC_GRANDCHILD; - if (write(syncfd, &s, sizeof(s)) != sizeof(s)) { - sane_kill(stage2_pid, SIGKILL); - bail("failed to sync with child: write(SYNC_GRANDCHILD)"); - } - - if (read(syncfd, &s, sizeof(s)) != sizeof(s)) - bail("failed to sync with child: next state"); - - switch (s) { - case SYNC_CHILD_FINISH: - write_log(DEBUG, "stage-2 complete"); - stage2_complete = true; - break; - default: - bail("unexpected sync value: %u", s); - } - } - write_log(DEBUG, "<- stage-2 synchronisation loop"); - write_log(DEBUG, "<~ nsexec stage-0"); - exit(0); - } - break; - /* * Stage 1: We're in the first child process. Our job is to join any * provided namespaces in the netlink payload and unshare all of @@ -959,11 +624,6 @@ void nsexec(void) /* For debugging. */ current_stage = STAGE_CHILD; - /* We're in a child and thus need to tell the parent if we die. */ - syncfd = sync_child_pipe[0]; - if (close(sync_child_pipe[1]) < 0) - bail("failed to close sync_child_pipe[1] fd"); - /* For debugging. */ prctl(PR_SET_NAME, (unsigned long)"runc:[1:CHILD]", 0, 0, 0); write_log(DEBUG, "~> nsexec stage-1"); @@ -1023,9 +683,9 @@ void nsexec(void) /* ... wait for mapping ... */ write_log(DEBUG, "waiting stage-0 to complete the mapping of user namespace"); if (read(syncfd, &s, sizeof(s)) != sizeof(s)) - bail("failed to sync with parent: read(SYNC_USERMAP_ACK)"); + bail("failed to sync with parent: read(SYNC_USERMAP_ACK) got %d", s); if (s != SYNC_USERMAP_ACK) - bail("failed to sync with parent: SYNC_USERMAP_ACK: got %u", s); + bail("failed to sync with parent: SYNC_USERMAP_ACK"); /* Revert temporary re-dumpable setting. 
*/ if (config.namespaces) { @@ -1100,12 +760,7 @@ void nsexec(void) bail("failed to sync with parent: SYNC_RECVPID_ACK: got %u", s); } - write_log(DEBUG, "signal completion to stage-0"); - s = SYNC_CHILD_FINISH; - if (write(syncfd, &s, sizeof(s)) != sizeof(s)) { - sane_kill(stage2_pid, SIGKILL); - bail("failed to sync with parent: write(SYNC_CHILD_FINISH)"); - } + close(syncfd); /* Our work is done. [Stage 2: STAGE_INIT] is doing the rest of the work. */ write_log(DEBUG, "<~ nsexec stage-1"); @@ -1120,32 +775,13 @@ void nsexec(void) * init_linux.go to run. */ case STAGE_INIT:{ - /* - * We're inside the child now, having jumped from the - * start_child() code after forking in the parent. - */ - enum sync_t s; - /* For debugging. */ current_stage = STAGE_INIT; - /* We're in a child and thus need to tell the parent if we die. */ - syncfd = sync_grandchild_pipe[0]; - if (close(sync_grandchild_pipe[1]) < 0) - bail("failed to close sync_grandchild_pipe[1] fd"); - - if (close(sync_child_pipe[0]) < 0) - bail("failed to close sync_child_pipe[0] fd"); - /* For debugging. */ prctl(PR_SET_NAME, (unsigned long)"runc:[2:INIT]", 0, 0, 0); write_log(DEBUG, "~> nsexec stage-2"); - if (read(syncfd, &s, sizeof(s)) != sizeof(s)) - bail("failed to sync with parent: read(SYNC_GRANDCHILD)"); - if (s != SYNC_GRANDCHILD) - bail("failed to sync with parent: SYNC_GRANDCHILD: got %u", s); - if (setsid() < 0) bail("setsid failed"); @@ -1160,14 +796,7 @@ void nsexec(void) bail("setgroups failed"); } - write_log(DEBUG, "signal completion to stage-0"); - s = SYNC_CHILD_FINISH; - if (write(syncfd, &s, sizeof(s)) != sizeof(s)) - bail("failed to sync with parent: write(SYNC_CHILD_FINISH)"); - - /* Close sync pipes. */ - if (close(sync_grandchild_pipe[0]) < 0) - bail("failed to close sync_grandchild_pipe[0] fd"); + close(syncfd); /* Free netlink data. 
*/ nl_free(&config); diff --git a/libcontainer/process_linux.go b/libcontainer/process_linux.go index ed421bc26a2..be7e035ff8a 100644 --- a/libcontainer/process_linux.go +++ b/libcontainer/process_linux.go @@ -55,6 +55,9 @@ type processComm struct { // indicate that it is ready. initSockParent *os.File initSockChild *os.File + // Used for control messages between parent and "runc init" stage-1 process. + stage1SockParent *os.File + stage1SockChild *os.File // Used for control messages between parent and "runc init". syncSockParent *syncSocket syncSockChild *syncSocket @@ -72,6 +75,10 @@ func newProcessComm() (*processComm, error) { if err != nil { return nil, fmt.Errorf("unable to create init pipe: %w", err) } + comm.stage1SockParent, comm.stage1SockChild, err = utils.NewSockPair("stage1") + if err != nil { + return nil, fmt.Errorf("unable to create stage1 pipe: %w", err) + } comm.syncSockParent, comm.syncSockChild, err = newSyncSockpair("sync") if err != nil { return nil, fmt.Errorf("unable to create sync pipe: %w", err) @@ -85,12 +92,14 @@ func newProcessComm() (*processComm, error) { func (c *processComm) closeChild() { _ = c.initSockChild.Close() + _ = c.stage1SockChild.Close() _ = c.syncSockChild.Close() _ = c.logPipeChild.Close() } func (c *processComm) closeParent() { _ = c.initSockParent.Close() + _ = c.stage1SockParent.Close() _ = c.syncSockParent.Close() // c.logPipeParent is kept alive for ForwardLogs } @@ -194,6 +203,13 @@ func (p *setnsProcess) start() (retErr error) { return fmt.Errorf("error copying bootstrap data to pipe: %w", err) } } + if err := p.setupNsExec(p.comm.stage1SockParent); err != nil { + return fmt.Errorf("error waiting nsexec report pid: %w", err) + } + if err := p.comm.stage1SockParent.Close(); err != nil { + return err + } + if err := p.execSetns(); err != nil { return fmt.Errorf("error executing setns process: %w", err) } @@ -332,20 +348,8 @@ func (p *setnsProcess) execSetns() error { _ = p.cmd.Wait() return 
&exec.ExitError{ProcessState: status} } - var pid *pid - if err := json.NewDecoder(p.comm.initSockParent).Decode(&pid); err != nil { - _ = p.cmd.Wait() - return fmt.Errorf("error reading pid from init pipe: %w", err) - } - - // Clean up the zombie parent process - // On Unix systems FindProcess always succeeds. - firstChildProcess, _ := os.FindProcess(pid.PidFirstChild) - // Ignore the error in case the child has already been reaped for any reason - _, _ = firstChildProcess.Wait() - - process, err := os.FindProcess(pid.Pid) + process, err := os.FindProcess(p.childPid) if err != nil { return err } @@ -359,24 +363,6 @@ type initProcess struct { intelRdtManager *intelrdt.Manager } -// getChildPid receives the final child's pid over the provided pipe. -func (p *initProcess) getChildPid() (int, error) { - var pid pid - if err := json.NewDecoder(p.comm.initSockParent).Decode(&pid); err != nil { - _ = p.cmd.Wait() - return -1, err - } - - // Clean up the zombie parent process - // On Unix systems FindProcess always succeeds. 
- firstChildProcess, _ := os.FindProcess(pid.PidFirstChild) - - // Ignore the error in case the child has already been reaped for any reason - _, _ = firstChildProcess.Wait() - - return pid.Pid, nil -} - func (p *initProcess) waitForChildExit(childPid int) error { status, err := p.cmd.Process.Wait() if err != nil { @@ -571,23 +557,28 @@ func (p *initProcess) start() (retErr error) { if _, err := io.Copy(p.comm.initSockParent, p.bootstrapData); err != nil { return fmt.Errorf("can't copy bootstrap data to pipe: %w", err) } + if err := p.setupNsExec(p.comm.stage1SockParent); err != nil { + return fmt.Errorf("error waiting nsexec report pid: %w", err) + } + if err := p.comm.stage1SockParent.Close(); err != nil { + return err + } - childPid, err := p.getChildPid() - if err != nil { - return fmt.Errorf("can't get final child's PID from pipe: %w", err) + if p.childPid <= 0 { + return fmt.Errorf("invalid child pid %d", p.childPid) } // Save the standard descriptor names before the container process // can potentially move them (e.g., via dup2()). If we don't do this now, // we won't know at checkpoint time which file descriptor to look up. 
- fds, err := getPipeFds(childPid) + fds, err := getPipeFds(p.childPid) if err != nil { - return fmt.Errorf("error getting pipe fds for pid %d: %w", childPid, err) + return fmt.Errorf("error getting pipe fds for pid %d: %w", p.childPid, err) } p.setExternalDescriptors(fds) // Wait for our first child to exit - if err := p.waitForChildExit(childPid); err != nil { + if err := p.waitForChildExit(p.childPid); err != nil { return fmt.Errorf("error waiting for our first child to exit: %w", err) } From 067dc3c90bb68d13051e6cc41ccb77d1097677be Mon Sep 17 00:00:00 2001 From: lifubang Date: Fri, 21 Feb 2025 02:49:51 +0000 Subject: [PATCH 5/5] replace some nsexec stage-2 c code with go implementation Signed-off-by: lifubang --- libcontainer/init_linux.go | 18 ++++++++++++++++++ libcontainer/nsenter/nsexec.c | 14 -------------- 2 files changed, 18 insertions(+), 14 deletions(-) diff --git a/libcontainer/init_linux.go b/libcontainer/init_linux.go index 79b40ce7204..9a6c9129c7b 100644 --- a/libcontainer/init_linux.go +++ b/libcontainer/init_linux.go @@ -222,6 +222,24 @@ func startInitialization() (retErr error) { return err } + if _, err := unix.Setsid(); err != nil { + return os.NewSyscallError("setsid", err) + } + + if err := unix.Setuid(0); err != nil { + return os.NewSyscallError("setuid", err) + } + + if err := unix.Setgid(0); err != nil { + return os.NewSyscallError("setgid", err) + } + + if !config.Config.RootlessEUID && requiresRootOrMappingTool(config.Config.GIDMappings) { + if err := unix.Setgroups([]int{0}); err != nil { + return os.NewSyscallError("setgroups", err) + } + } + // If init succeeds, it will not return, hence none of the defers will be called. 
return containerInit(it, &config, syncPipe, consoleSocket, pidfdSocket, fifoFile, logPipe) } diff --git a/libcontainer/nsenter/nsexec.c b/libcontainer/nsenter/nsexec.c index af25e2aed48..edc79612dab 100644 --- a/libcontainer/nsenter/nsexec.c +++ b/libcontainer/nsenter/nsexec.c @@ -782,20 +782,6 @@ void nsexec(void) prctl(PR_SET_NAME, (unsigned long)"runc:[2:INIT]", 0, 0, 0); write_log(DEBUG, "~> nsexec stage-2"); - if (setsid() < 0) - bail("setsid failed"); - - if (setuid(0) < 0) - bail("setuid failed"); - - if (setgid(0) < 0) - bail("setgid failed"); - - if (!config.is_rootless_euid && config.is_setgroup) { - if (setgroups(0, NULL) < 0) - bail("setgroups failed"); - } - close(syncfd); /* Free netlink data. */