From c10a85d615644f7257e4c36a789325201949d835 Mon Sep 17 00:00:00 2001 From: lfbzhm Date: Fri, 8 Nov 2024 06:54:08 +0000 Subject: [PATCH 1/2] libct: use pidfd and epoll to wait the init process exit Signed-off-by: lifubang --- delete.go | 19 +++----- internal/linux/linux.go | 11 +++++ libcontainer/README.md | 3 ++ libcontainer/container_linux.go | 85 ++++++++++++++++++++++++++++++++- 4 files changed, 103 insertions(+), 15 deletions(-) diff --git a/delete.go b/delete.go index dcb19be1b94..fe320706cc5 100644 --- a/delete.go +++ b/delete.go @@ -5,23 +5,16 @@ import ( "fmt" "os" "path/filepath" - "time" "github.com/opencontainers/runc/libcontainer" "github.com/urfave/cli" - - "golang.org/x/sys/unix" ) -func killContainer(container *libcontainer.Container) error { - _ = container.Signal(unix.SIGKILL) - for range 100 { - time.Sleep(100 * time.Millisecond) - if err := container.Signal(unix.Signal(0)); err != nil { - return container.Destroy() - } +func killAndDestroy(container *libcontainer.Container) error { + if err := container.EnsureKilled(); err != nil { + return err } - return errors.New("container init still running") + return container.Destroy() } var deleteCommand = cli.Command{ @@ -71,7 +64,7 @@ status of "ubuntu01" as "stopped" the following will delete resources held for // namespace) there may be some leftover processes in the // container's cgroup. if force { - return killContainer(container) + return killAndDestroy(container) } s, err := container.Status() if err != nil { @@ -81,7 +74,7 @@ status of "ubuntu01" as "stopped" the following will delete resources held for case libcontainer.Stopped: return container.Destroy() case libcontainer.Created: - return killContainer(container) + return killAndDestroy(container) default: return fmt.Errorf("cannot delete container %s that is not stopped: %s", id, s) } diff --git a/internal/linux/linux.go b/internal/linux/linux.go index 9b049647903..848d4dfa380 100644 --- a/internal/linux/linux.go +++ b/internal/linux/linux.go @@ -72,3 +72,14 @@ func Sendmsg(fd int, p, oob []byte, to unix.Sockaddr, flags int) error { }) return os.NewSyscallError("sendmsg", err) } + +// EpollWait wraps [unix.EpollWait]. +func EpollWait(epfd int, events []unix.EpollEvent, msec int) (n int, err error) { + n, err = retryOnEINTR2(func() (int, error) { + return unix.EpollWait(epfd, events, msec) + }) + if err != nil { + return 0, os.NewSyscallError("epollwait", err) + } + return n, nil +} diff --git a/libcontainer/README.md b/libcontainer/README.md index 901351edb71..985abd699e6 100644 --- a/libcontainer/README.md +++ b/libcontainer/README.md @@ -230,6 +230,9 @@ container.Resume() // send signal to container's init process. container.Signal(signal) +// send signal to container's init process and waits for the kernel to finish killing it. +container.EnsureKilled() + // update container resource constraints. container.Set(config) diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go index cfb3ccf5cd1..8ea789b5903 100644 --- a/libcontainer/container_linux.go +++ b/libcontainer/container_linux.go @@ -21,6 +21,7 @@ import ( "golang.org/x/sys/unix" "github.com/opencontainers/cgroups" + "github.com/opencontainers/runc/internal/linux" "github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runc/libcontainer/exeseal" "github.com/opencontainers/runc/libcontainer/intelrdt" @@ -377,9 +378,13 @@ func (c *Container) start(process *Process) (retErr error) { // Signal sends a specified signal to container's init. // -// When s is SIGKILL and the container does not have its own PID namespace, all -// the container's processes are killed. In this scenario, the libcontainer +// When s is SIGKILL: +// 1. If the container does not have its own PID namespace, all the +// container's processes are killed. In this scenario, the libcontainer // user may be required to implement a proper child reaper. +// 2. Otherwise, we just send the SIGKILL signal to the init process, +// but we don't wait for the init process to disappear. If you want to +// wait, please use c.EnsureKilled instead. func (c *Container) Signal(s os.Signal) error { c.m.Lock() defer c.m.Unlock() @@ -431,6 +436,82 @@ func (c *Container) signal(s os.Signal) error { return nil } +func (c *Container) killViaPidfd() error { + c.m.Lock() + defer c.m.Unlock() + + // To avoid a PID reuse attack, don't kill non-running container. + if !c.hasInit() { + return ErrNotRunning + } + + pidfd, err := unix.PidfdOpen(c.initProcess.pid(), 0) + if err != nil { + return err + } + defer unix.Close(pidfd) + + epollfd, err := unix.EpollCreate1(unix.EPOLL_CLOEXEC) + if err != nil { + return err + } + defer unix.Close(epollfd) + + event := unix.EpollEvent{ + Events: unix.EPOLLIN, + Fd: int32(pidfd), + } + if err := unix.EpollCtl(epollfd, unix.EPOLL_CTL_ADD, pidfd, &event); err != nil { + return err + } + + if err := unix.PidfdSendSignal(pidfd, unix.SIGKILL, nil, 0); err != nil { + return err + } + + events := make([]unix.EpollEvent, 1) + // Set the timeout to 10s, the same as in kill below. + n, err := linux.EpollWait(epollfd, events, 10000) + if err != nil { + return err + } + if n > 0 { + for i := range n { + event := events[i] + if event.Fd == int32(pidfd) { + return nil + } + } + } + return errors.New("container init still running") +} + +func (c *Container) kill() error { + _ = c.Signal(unix.SIGKILL) + for i := 0; i < 100; i++ { + time.Sleep(100 * time.Millisecond) + if err := c.Signal(unix.Signal(0)); err != nil { + return nil + } + } + return errors.New("container init still running") +} + +// EnsureKilled kills the container and waits for the kernel to finish killing it. +func (c *Container) EnsureKilled() error { + // When a container doesn't have a private pidns, we have to kill all processes + // in the cgroup, it's more simpler to use `cgroup.kill` or `unix.Kill`. + if c.config.Namespaces.IsPrivate(configs.NEWPID) { + var err error + if err = c.killViaPidfd(); err == nil { + return nil + } + + logrus.Debugf("pidfd & epoll failed, falling back to unix.Signal: %v", err) + } + return c.kill() +} + func (c *Container) createExecFifo() (retErr error) { rootuid, err := c.config.HostRootUID() if err != nil { From e22ebcd50808c9d01e79d6331aa7517c6dffc3e0 Mon Sep 17 00:00:00 2001 From: Abel Feng Date: Tue, 5 Nov 2024 14:38:41 +0800 Subject: [PATCH 2/2] libct: reduce the delete delay When using unix.Kill to kill the container, we need a for loop to detect the init process exited or not manually, we sleep 100ms each time in the current, but for stopped containers or containers running in a low load machine, we don't need to wait so long time. This change will reduce the delete delay in some situations, especially for those pods with many containers in. Co-authored-by: Abel Feng Signed-off-by: lifubang --- delete.go | 31 +++++++++++++------------------ libcontainer/container_linux.go | 9 +++++++++ 2 files changed, 22 insertions(+), 18 deletions(-) diff --git a/delete.go b/delete.go index fe320706cc5..25ce2b494c6 100644 --- a/delete.go +++ b/delete.go @@ -10,13 +10,6 @@ import ( "github.com/urfave/cli" ) -func killAndDestroy(container *libcontainer.Container) error { - if err := container.EnsureKilled(); err != nil { - return err - } - return container.Destroy() -} - var deleteCommand = cli.Command{ Name: "delete", Usage: "delete any resources held by the container often used with detached container", @@ -58,25 +51,27 @@ status of "ubuntu01" as "stopped" the following will delete resources held for } return err } - // When --force is given, we kill all container processes and - // then destroy the container. This is done even for a stopped - // container, because (in case it does not have its own PID - // namespace) there may be some leftover processes in the - // container's cgroup. - if force { - return killAndDestroy(container) - } s, err := container.Status() if err != nil { return err } switch s { case libcontainer.Stopped: - return container.Destroy() + // If the container is stopped, we can just destroy it. case libcontainer.Created: - return killAndDestroy(container) + if err := container.EnsureKilled(); err != nil { + return err + } default: - return fmt.Errorf("cannot delete container %s that is not stopped: %s", id, s) + if !force { + return fmt.Errorf("cannot delete container %s that is not stopped: %s", id, s) + } + // When --force is given, we kill all container processes and + // then destroy the container. + if err := container.EnsureKilled(); err != nil { + return err + } } + return container.Destroy() }, } diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go index 8ea789b5903..652d531a55c 100644 --- a/libcontainer/container_linux.go +++ b/libcontainer/container_linux.go @@ -488,6 +488,15 @@ func (c *Container) killViaPidfd() error { func (c *Container) kill() error { _ = c.Signal(unix.SIGKILL) + + // For containers running in a low load machine, we only need to wait about 1ms. + time.Sleep(time.Millisecond) + if err := c.Signal(unix.Signal(0)); err != nil { + return nil + } + + // For some containers in a heavy load machine, we need to wait more time. + logrus.Debugln("We need more time to wait the init process exit.") for i := 0; i < 100; i++ { time.Sleep(100 * time.Millisecond) if err := c.Signal(unix.Signal(0)); err != nil {