
Commit 06fa617

Lock pod while starting and stopping containers
The intention behind this is to stop races between `pod stop|start` and `container stop|start` being run at the same time. Such a race could result in containers with no working network (they join the still-running infra container's netns, which is then torn down as the infra container stops, leaving the container in an otherwise unused, nonfunctional, orphaned netns). Locking the pod (if present) in the public container start and stop APIs should be sufficient to stop this.

Signed-off-by: Matt Heon <[email protected]>
1 parent be5d807 commit 06fa617
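
The fix hinges on a fixed lock-acquisition order: pod lock first, then container lock. Below is a minimal, self-contained sketch of that convention using plain `sync.Mutex` stand-ins; the types and names (`toyPod`, `toyCtr`, `withLocks`) are hypothetical, since libpod's real locks come from a shared-memory lock manager rather than embedded mutexes.

```go
package main

import (
	"fmt"
	"sync"
)

// Hypothetical stand-ins: libpod's actual locks are allocated by a
// shared-memory lock manager, not embedded sync.Mutex values.
type toyPod struct{ lock sync.Mutex }

type toyCtr struct {
	lock sync.Mutex
	pod  *toyPod // nil when the container is not in a pod
}

// withLocks applies the ordering rule from the commit: the pod lock is
// always taken before the container lock, so `container start` and
// `pod stop` on the same pod serialize instead of racing on the netns.
func (c *toyCtr) withLocks(fn func()) {
	if c.pod != nil {
		c.pod.lock.Lock()
		defer c.pod.lock.Unlock()
	}
	c.lock.Lock()
	defer c.lock.Unlock()
	fn()
}

func main() {
	p := &toyPod{}
	c := &toyCtr{pod: p}

	var wg sync.WaitGroup
	wg.Add(2)
	// "container start": takes the pod lock, then the container lock.
	go func() { defer wg.Done(); c.withLocks(func() { fmt.Println("start: netns joined") }) }()
	// "pod stop": holds the pod lock for the whole teardown.
	go func() {
		defer wg.Done()
		p.lock.Lock()
		fmt.Println("pod stop: netns torn down")
		p.lock.Unlock()
	}()
	wg.Wait()
}
```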

3 files changed (+64 −26 lines)

libpod/container_api.go

Lines changed: 29 additions & 25 deletions
```diff
@@ -85,34 +85,23 @@ func (c *Container) initUnlocked(ctx context.Context, recursive bool) error {
 // Start requires that all dependency containers (e.g. pod infra containers) are
 // running before starting the container. The recursive parameter, if set, will start all
 // dependencies before starting this container.
-func (c *Container) Start(ctx context.Context, recursive bool) (finalErr error) {
-	if !c.batched {
-		c.lock.Lock()
-		defer c.lock.Unlock()
-
-		// defer's are executed LIFO so we are locked here
-		// as long as we call this after the defer unlock()
-		defer func() {
-			if finalErr != nil {
-				if err := saveContainerError(c, finalErr); err != nil {
-					logrus.Debug(err)
-				}
-			}
-		}()
-
-		if err := c.syncContainer(); err != nil {
-			return err
+func (c *Container) Start(ctx context.Context, recursive bool) error {
+	// Have to lock the pod the container is a part of.
+	// This prevents running `podman start` at the same time a
+	// `podman pod stop` is running, which could lead to weird races.
+	// Pod locks come before container locks, so do this first.
+	if c.config.Pod != "" {
+		// If we get an error, the pod was probably removed.
+		// So we get an expected ErrCtrRemoved instead of ErrPodRemoved,
+		// just ignore this and move on to syncing the container.
+		pod, _ := c.runtime.state.Pod(c.config.Pod)
+		if pod != nil {
+			pod.lock.Lock()
+			defer pod.lock.Unlock()
 		}
 	}
-	if err := c.prepareToStart(ctx, recursive); err != nil {
-		return err
-	}
 
-	// Start the container
-	if err := c.start(); err != nil {
-		return err
-	}
-	return c.waitForHealthy(ctx)
+	return c.startNoPodLock(ctx, recursive)
 }
 
 // Update updates the given container.
@@ -294,6 +283,21 @@ func (c *Container) Stop() error {
 // manually. If timeout is 0, SIGKILL will be used immediately to kill the
 // container.
 func (c *Container) StopWithTimeout(timeout uint) (finalErr error) {
+	// Have to lock the pod the container is a part of.
+	// This prevents running `podman stop` at the same time a
+	// `podman pod start` is running, which could lead to weird races.
+	// Pod locks come before container locks, so do this first.
+	if c.config.Pod != "" {
+		// If we get an error, the pod was probably removed.
+		// So we get an expected ErrCtrRemoved instead of ErrPodRemoved,
+		// just ignore this and move on to syncing the container.
+		pod, _ := c.runtime.state.Pod(c.config.Pod)
+		if pod != nil {
+			pod.lock.Lock()
+			defer pod.lock.Unlock()
+		}
+	}
+
 	if !c.batched {
 		c.lock.Lock()
 		defer c.lock.Unlock()
```
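
To make the effect of these two hunks concrete, here is a small self-contained toy of the race being closed (all names hypothetical: a bool stands in for the infra container's netns, a mutex for the pod lock). Once both paths contend on the same pod lock, a starting container either joins a live netns or sees that the pod has stopped and can fail cleanly, instead of silently joining a netns that is mid-teardown.

```go
package main

import (
	"fmt"
	"sync"
)

func main() {
	var podLock sync.Mutex // stand-in for the pod's lock
	netnsAlive := true     // stand-in for the infra container's netns

	var wg sync.WaitGroup
	wg.Add(2)

	// `podman pod stop`: tears down the infra netns under the pod lock.
	go func() {
		defer wg.Done()
		podLock.Lock()
		defer podLock.Unlock()
		netnsAlive = false
	}()

	// `podman container start`: now also takes the pod lock first, so it
	// can no longer observe a netns that is halfway through teardown.
	go func() {
		defer wg.Done()
		podLock.Lock()
		defer podLock.Unlock()
		if !netnsAlive {
			fmt.Println("pod already stopped; start fails cleanly")
			return
		}
		fmt.Println("joined a live infra netns")
	}()

	wg.Wait()
}
```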

libpod/container_internal.go

Lines changed: 34 additions & 0 deletions
```diff
@@ -1260,6 +1260,40 @@ func (c *Container) initAndStart(ctx context.Context) (retErr error) {
 	return c.waitForHealthy(ctx)
 }
 
+// Internal function to start a container without taking the pod lock.
+// Please note that this DOES take the container lock.
+// Intended to be used in pod-related functions.
+func (c *Container) startNoPodLock(ctx context.Context, recursive bool) (finalErr error) {
+	if !c.batched {
+		c.lock.Lock()
+		defer c.lock.Unlock()
+
+		// defer's are executed LIFO so we are locked here
+		// as long as we call this after the defer unlock()
+		defer func() {
+			if finalErr != nil {
+				if err := saveContainerError(c, finalErr); err != nil {
+					logrus.Debug(err)
+				}
+			}
+		}()
+
+		if err := c.syncContainer(); err != nil {
+			return err
+		}
+	}
+
+	if err := c.prepareToStart(ctx, recursive); err != nil {
+		return err
+	}
+
+	// Start the container
+	if err := c.start(); err != nil {
+		return err
+	}
+	return c.waitForHealthy(ctx)
+}
+
 // Internal, non-locking function to start a container
 func (c *Container) start() error {
 	if c.config.Spec.Process != nil {
```
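
The LIFO comment carried over into `startNoPodLock` is doing real work: deferred calls run last-registered-first, so the error-saving closure, registered after `defer c.lock.Unlock()`, executes before the unlock, while the container lock is still held. A tiny standalone demonstration of that ordering (nothing libpod-specific):

```go
package main

import "fmt"

func main() {
	defer fmt.Println("3: unlock (registered first, runs last)")
	defer fmt.Println("2: save error state (registered second, runs first, still locked)")
	fmt.Println("1: function body")
}
```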

libpod/pod_api.go

Lines changed: 1 addition & 1 deletion
```diff
@@ -24,7 +24,7 @@ func (p *Pod) startInitContainers(ctx context.Context) error {
 	}
 	// Now iterate init containers
 	for _, initCon := range initCtrs {
-		if err := initCon.Start(ctx, true); err != nil {
+		if err := initCon.startNoPodLock(ctx, true); err != nil {
 			return err
 		}
 		// Check that the init container waited correctly and the exit
```
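
The one-line change here closes the loop: pod start paths already hold the pod lock when they reach this init-container loop, and the public `Start` now tries to take that same lock, which would deadlock assuming the locks are not reentrant. Hence the switch to the internal variant. A minimal sketch of the calling convention, with hypothetical names in place of libpod's real types:

```go
package main

import (
	"fmt"
	"sync"
)

type pod struct{ lock sync.Mutex }

// Stand-in for Container.startNoPodLock: it assumes the caller already
// holds the pod lock and therefore does not try to acquire it.
func startNoPodLock() { fmt.Println("init container started under caller's pod lock") }

func (p *pod) startInitContainers() {
	// Calling the public Start here would re-acquire the pod lock the
	// caller already holds and deadlock, since the lock is not reentrant.
	startNoPodLock()
}

func main() {
	p := &pod{}
	p.lock.Lock() // pod start holds the pod lock around init-container startup
	p.startInitContainers()
	p.lock.Unlock()
}
```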
