Skip to content

Commit c10a85d

Browse files
committed
libct: use pidfd and epoll to wait the init process exit
Signed-off-by: lifubang <[email protected]>
1 parent 7483452 commit c10a85d

File tree

4 files changed

+103
-15
lines changed

4 files changed

+103
-15
lines changed

delete.go

Lines changed: 6 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -5,23 +5,16 @@ import (
55
"fmt"
66
"os"
77
"path/filepath"
8-
"time"
98

109
"github.com/opencontainers/runc/libcontainer"
1110
"github.com/urfave/cli"
12-
13-
"golang.org/x/sys/unix"
1411
)
1512

16-
func killContainer(container *libcontainer.Container) error {
17-
_ = container.Signal(unix.SIGKILL)
18-
for range 100 {
19-
time.Sleep(100 * time.Millisecond)
20-
if err := container.Signal(unix.Signal(0)); err != nil {
21-
return container.Destroy()
22-
}
13+
func killAndDestroy(container *libcontainer.Container) error {
14+
if err := container.EnsureKilled(); err != nil {
15+
return err
2316
}
24-
return errors.New("container init still running")
17+
return container.Destroy()
2518
}
2619

2720
var deleteCommand = cli.Command{
@@ -71,7 +64,7 @@ status of "ubuntu01" as "stopped" the following will delete resources held for
7164
// namespace) there may be some leftover processes in the
7265
// container's cgroup.
7366
if force {
74-
return killContainer(container)
67+
return killAndDestroy(container)
7568
}
7669
s, err := container.Status()
7770
if err != nil {
@@ -81,7 +74,7 @@ status of "ubuntu01" as "stopped" the following will delete resources held for
8174
case libcontainer.Stopped:
8275
return container.Destroy()
8376
case libcontainer.Created:
84-
return killContainer(container)
77+
return killAndDestroy(container)
8578
default:
8679
return fmt.Errorf("cannot delete container %s that is not stopped: %s", id, s)
8780
}

internal/linux/linux.go

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,3 +72,14 @@ func Sendmsg(fd int, p, oob []byte, to unix.Sockaddr, flags int) error {
7272
})
7373
return os.NewSyscallError("sendmsg", err)
7474
}
75+
76+
// EpollWait wraps [unix.EpollWait].
77+
func EpollWait(epfd int, events []unix.EpollEvent, msec int) (n int, err error) {
78+
n, err = retryOnEINTR2(func() (int, error) {
79+
return unix.EpollWait(epfd, events, msec)
80+
})
81+
if err != nil {
82+
return 0, os.NewSyscallError("epollwait", err)
83+
}
84+
return n, nil
85+
}

libcontainer/README.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -230,6 +230,9 @@ container.Resume()
230230
// send signal to container's init process.
231231
container.Signal(signal)
232232

233+
// send SIGKILL to the container's init process and wait for the kernel to finish killing it.
234+
container.EnsureKilled()
235+
233236
// update container resource constraints.
234237
container.Set(config)
235238

libcontainer/container_linux.go

Lines changed: 83 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ import (
2121
"golang.org/x/sys/unix"
2222

2323
"github.com/opencontainers/cgroups"
24+
"github.com/opencontainers/runc/internal/linux"
2425
"github.com/opencontainers/runc/libcontainer/configs"
2526
"github.com/opencontainers/runc/libcontainer/exeseal"
2627
"github.com/opencontainers/runc/libcontainer/intelrdt"
@@ -377,9 +378,13 @@ func (c *Container) start(process *Process) (retErr error) {
377378

378379
// Signal sends a specified signal to container's init.
379380
//
380-
// When s is SIGKILL and the container does not have its own PID namespace, all
381-
// the container's processes are killed. In this scenario, the libcontainer
381+
// When s is SIGKILL:
382+
// 1. If the container does not have its own PID namespace, all the
383+
// container's processes are killed. In this scenario, the libcontainer
382384
// user may be required to implement a proper child reaper.
385+
// 2. Otherwise, we just send the SIGKILL signal to the init process,
386+
// but we don't wait for the init process to disappear. If you want to
387+
// wait, please use c.EnsureKilled instead.
383388
func (c *Container) Signal(s os.Signal) error {
384389
c.m.Lock()
385390
defer c.m.Unlock()
@@ -431,6 +436,82 @@ func (c *Container) signal(s os.Signal) error {
431436
return nil
432437
}
433438

439+
func (c *Container) killViaPidfd() error {
440+
c.m.Lock()
441+
defer c.m.Unlock()
442+
443+
// To avoid a PID reuse attack, don't kill non-running container.
444+
if !c.hasInit() {
445+
return ErrNotRunning
446+
}
447+
448+
pidfd, err := unix.PidfdOpen(c.initProcess.pid(), 0)
449+
if err != nil {
450+
return err
451+
}
452+
defer unix.Close(pidfd)
453+
454+
epollfd, err := unix.EpollCreate1(unix.EPOLL_CLOEXEC)
455+
if err != nil {
456+
return err
457+
}
458+
defer unix.Close(epollfd)
459+
460+
event := unix.EpollEvent{
461+
Events: unix.EPOLLIN,
462+
Fd: int32(pidfd),
463+
}
464+
if err := unix.EpollCtl(epollfd, unix.EPOLL_CTL_ADD, pidfd, &event); err != nil {
465+
return err
466+
}
467+
468+
if err := unix.PidfdSendSignal(pidfd, unix.SIGKILL, nil, 0); err != nil {
469+
return err
470+
}
471+
472+
events := make([]unix.EpollEvent, 1)
473+
// Set the timeout to 10s, the same as in kill below.
474+
n, err := linux.EpollWait(epollfd, events, 10000)
475+
if err != nil {
476+
return err
477+
}
478+
if n > 0 {
479+
for i := range n {
480+
event := events[i]
481+
if event.Fd == int32(pidfd) {
482+
return nil
483+
}
484+
}
485+
}
486+
return errors.New("container init still running")
487+
}
488+
489+
func (c *Container) kill() error {
490+
_ = c.Signal(unix.SIGKILL)
491+
for i := 0; i < 100; i++ {
492+
time.Sleep(100 * time.Millisecond)
493+
if err := c.Signal(unix.Signal(0)); err != nil {
494+
return nil
495+
}
496+
}
497+
return errors.New("container init still running")
498+
}
499+
500+
// EnsureKilled kills the container and waits for the kernel to finish killing it.
501+
func (c *Container) EnsureKilled() error {
502+
// When a container doesn't have a private pidns, we have to kill all processes
503+
// in the cgroup, it's more simpler to use `cgroup.kill` or `unix.Kill`.
504+
if c.config.Namespaces.IsPrivate(configs.NEWPID) {
505+
var err error
506+
if err = c.killViaPidfd(); err == nil {
507+
return nil
508+
}
509+
510+
logrus.Debugf("pidfd & epoll failed, falling back to unix.Signal: %v", err)
511+
}
512+
return c.kill()
513+
}
514+
434515
func (c *Container) createExecFifo() (retErr error) {
435516
rootuid, err := c.config.HostRootUID()
436517
if err != nil {

0 commit comments

Comments
 (0)