Skip to content

Commit 686c694

Browse files
corhere authored and laurazard committed
runc-shim: refuse to start execs after init exits
The runc task state machine prevents execs from being created after the init process has exited, but there are no guards against starting a created exec after the init process has exited. That leaves a small window for starting an exec to race our handling of the init process exiting. Normally this is not an issue in practice: the kernel will atomically kill all processes in a PID namespace when its "init" process terminates, and will not allow new processes to fork(2) into the PID namespace afterwards. Therefore the racing exec is guaranteed by the kernel to not be running after the init process terminates.

On the other hand, when the container does not have a private PID namespace (i.e. the container's init process is not the "init" process of the container's PID namespace), the kernel does not automatically kill other container processes on init exit and will happily allow runc to start an exec process at any time. It is the runc shim's responsibility to clean up the container when the init process exits in this situation by killing all the container's remaining processes.

Block execs from being started after the container's init process has exited to prevent the processes from leaking, and to avoid violating the task service's assumption that an exec can be running iff the init process is also running.

Signed-off-by: Cory Snider <[email protected]>
(cherry picked from commit e735791)
Signed-off-by: Laura Brehm <[email protected]>
1 parent 760935e commit 686c694

File tree

1 file changed

+17
-0
lines changed

1 file changed

+17
-0
lines changed

runtime/v2/runc/task/service.go

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,7 @@ func NewTaskService(ctx context.Context, publisher shim.Publisher, sd shutdown.S
8181
containers: make(map[string]*runc.Container),
8282
running: make(map[int][]containerProcess),
8383
pendingExecs: make(map[*runc.Container]int),
84+
execable: make(map[*runc.Container]bool),
8485
exitSubscribers: make(map[*map[int][]runcC.Exit]struct{}),
8586
}
8687
go s.processExits()
@@ -117,6 +118,12 @@ type service struct {
117118
lifecycleMu sync.Mutex
118119
running map[int][]containerProcess // pid -> running process, guarded by lifecycleMu
119120
pendingExecs map[*runc.Container]int // container -> num pending execs, guarded by lifecycleMu
121+
// container -> execs can be started, guarded by lifecycleMu.
122+
// Execs can be started if the container's init process (read: pid, not [process.Init])
123+
// has been started and not yet reaped by the shim.
124+
// Note that this flag gets updated before the container's [process.Init.Status]
125+
// is transitioned to "stopped".
126+
execable map[*runc.Container]bool
120127
// Subscriptions to exits for PIDs. Adding/deleting subscriptions and
121128
// dereferencing the subscription pointers must only be done while holding
122129
// lifecycleMu.
@@ -230,6 +237,9 @@ func (s *service) preStart(c *runc.Container) (handleStarted func(*runc.Containe
230237
Container: c,
231238
Process: p,
232239
})
240+
if init {
241+
s.execable[c] = true
242+
}
233243
s.lifecycleMu.Unlock()
234244
}
235245
}
@@ -304,6 +314,10 @@ func (s *service) Start(ctx context.Context, r *taskAPI.StartRequest) (*taskAPI.
304314
if r.ExecID == "" {
305315
cinit = container
306316
} else {
317+
if !s.execable[container] {
318+
s.lifecycleMu.Unlock()
319+
return nil, errdefs.ToGRPCf(errdefs.ErrFailedPrecondition, "container %s init process is not running", container.ID)
320+
}
307321
s.pendingExecs[container]++
308322
}
309323
handleStarted, cleanup := s.preStart(cinit)
@@ -679,6 +693,9 @@ func (s *service) processExits() {
679693
var cps, skipped []containerProcess
680694
for _, cp := range s.running[e.Pid] {
681695
_, init := cp.Process.(*process.Init)
696+
if init {
697+
delete(s.execable, cp.Container)
698+
}
682699
if init && s.pendingExecs[cp.Container] != 0 {
683700
// This exit relates to a container for which we have pending execs. In
684701
// order to ensure order between execs and the init process for a given

0 commit comments

Comments
 (0)