
Commit 46d874a

Refactor graph traversal & use for pod stop
First, refactor our existing graph traversal code to improve code sharing. There still isn't much sharing between inward traversal (stop, remove) and outward traversal (start), but stop and remove now share most of their code, which seems a positive.

Second, add a new graph-traversal function to stop containers. We already had start and remove; stop uses the newly refactored inward-traversal code, which it shares with removal.

Third, rework the shared stop/removal inward-traversal code to add locking. This allows parallel execution of stop and removal, which should improve the performance of `podman pod rm` and keep the performance of `podman pod stop` at about its current level.

Fourth and finally, use the new graph-based stop when possible to solve unordered-stop problems with pods - specifically, the infra container stopping before application containers, which leaves those containers without a working network.

Fixes https://issues.redhat.com/browse/RHEL-76827

Signed-off-by: Matt Heon <[email protected]>
1 parent 06fa617 commit 46d874a
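
To make the commit message concrete, here is a minimal, self-contained sketch of the inward-traversal idea it describes. This is not Podman's code: the node/traversal types, the two-container pod in main, and the use of plain goroutines in place of Podman's pkg/parallel are illustrative assumptions, and the setError propagation path of the real code is left out. Only the ordering rule is taken from the commit: act on a node only after everything that depends on it has finished, mark it visited only once its action completes, then recurse to the nodes it depends on.

package main

import (
	"fmt"
	"sync"
)

// node mirrors the shape of libpod's containerNode: a per-node lock plus
// dependency edges in both directions.
type node struct {
	lock       sync.Mutex
	id         string
	dependsOn  []*node // nodes this one needs (e.g. the pod's infra container)
	dependedOn []*node // nodes that need this one
}

// traversal mirrors the idea of nodeTraversal: one lock guarding the shared
// visited/error maps, plus the per-container action.
type traversal struct {
	lock    sync.Mutex
	visited map[string]bool
	errs    map[string]error
	action  func(n *node) error
}

// inward acts on a node only once everything that depends on it is finished,
// then recurses to the nodes it depends on. The setError propagation path of
// the real code is omitted for brevity.
func (t *traversal) inward(n *node) {
	n.lock.Lock()

	t.lock.Lock()
	skip := t.visited[n.id]
	// Refuse to act while something that depends on us has not finished;
	// the last dependent to finish will recurse back into us.
	for _, dep := range n.dependedOn {
		if !t.visited[dep.id] {
			skip = true
		}
	}
	t.lock.Unlock()
	if skip {
		n.lock.Unlock()
		return
	}

	if err := t.action(n); err != nil {
		t.lock.Lock()
		t.errs[n.id] = err
		t.lock.Unlock()
	}

	// Mark visited only after the action completes, so nodes we depend on are
	// never released early; then drop the per-node lock and recurse inwards.
	t.lock.Lock()
	t.visited[n.id] = true
	t.lock.Unlock()
	n.lock.Unlock()

	for _, succ := range n.dependsOn {
		t.inward(succ)
	}
}

func main() {
	// A two-container pod: "app" depends on "infra", so app must stop first
	// (the unordered-stop problem this commit fixes).
	infra := &node{id: "infra"}
	app := &node{id: "app", dependsOn: []*node{infra}}
	infra.dependedOn = []*node{app}

	t := &traversal{
		visited: map[string]bool{},
		errs:    map[string]error{},
		action:  func(n *node) error { fmt.Println("stopping", n.id); return nil },
	}

	// One parallel job per node that nothing depends on, as the real code does
	// with parallel.Enqueue over graph.notDependedOnNodes.
	var wg sync.WaitGroup
	for _, root := range []*node{app} {
		wg.Add(1)
		go func(r *node) { defer wg.Done(); t.inward(r) }(root)
	}
	wg.Wait()
	fmt.Println("errors:", t.errs)
}

Running the sketch prints "stopping app" before "stopping infra", which is exactly the ordering `podman pod stop` needs so the infra container's network outlives the application containers.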

6 files changed: +313, -140 lines

libpod/container_api.go

Lines changed: 3 additions & 54 deletions
@@ -88,7 +88,7 @@ func (c *Container) initUnlocked(ctx context.Context, recursive bool) error {
 func (c *Container) Start(ctx context.Context, recursive bool) error {
 	// Have to lock the pod the container is a part of.
 	// This prevents running `podman start` at the same time a
-	// `podman pod stop` is running, which could lead to wierd races.
+	// `podman pod stop` is running, which could lead to weird races.
 	// Pod locks come before container locks, so do this first.
 	if c.config.Pod != "" {
 		// If we get an error, the pod was probably removed.
@@ -285,7 +285,7 @@ func (c *Container) Stop() error {
 func (c *Container) StopWithTimeout(timeout uint) (finalErr error) {
 	// Have to lock the pod the container is a part of.
 	// This prevents running `podman stop` at the same time a
-	// `podman pod start` is running, which could lead to wierd races.
+	// `podman pod start` is running, which could lead to weird races.
 	// Pod locks come before container locks, so do this first.
 	if c.config.Pod != "" {
 		// If we get an error, the pod was probably removed.
@@ -856,58 +856,7 @@ func (c *Container) Cleanup(ctx context.Context, onlyStopped bool) error {
 		}
 	}
 
-	// Check if state is good
-	if !c.ensureState(define.ContainerStateConfigured, define.ContainerStateCreated, define.ContainerStateStopped, define.ContainerStateStopping, define.ContainerStateExited) {
-		return fmt.Errorf("container %s is running or paused, refusing to clean up: %w", c.ID(), define.ErrCtrStateInvalid)
-	}
-	if onlyStopped && !c.ensureState(define.ContainerStateStopped) {
-		return fmt.Errorf("container %s is not stopped and only cleanup for a stopped container was requested: %w", c.ID(), define.ErrCtrStateInvalid)
-	}
-
-	// if the container was not created in the oci runtime or was already cleaned up, then do nothing
-	if c.ensureState(define.ContainerStateConfigured, define.ContainerStateExited) {
-		return nil
-	}
-
-	// Handle restart policy.
-	// Returns a bool indicating whether we actually restarted.
-	// If we did, don't proceed to cleanup - just exit.
-	didRestart, err := c.handleRestartPolicy(ctx)
-	if err != nil {
-		return err
-	}
-	if didRestart {
-		return nil
-	}
-
-	// If we didn't restart, we perform a normal cleanup
-
-	// make sure all the container processes are terminated if we are running without a pid namespace.
-	hasPidNs := false
-	if c.config.Spec.Linux != nil {
-		for _, i := range c.config.Spec.Linux.Namespaces {
-			if i.Type == spec.PIDNamespace {
-				hasPidNs = true
-				break
-			}
-		}
-	}
-	if !hasPidNs {
-		// do not fail on errors
-		_ = c.ociRuntime.KillContainer(c, uint(unix.SIGKILL), true)
-	}
-
-	// Check for running exec sessions
-	sessions, err := c.getActiveExecSessions()
-	if err != nil {
-		return err
-	}
-	if len(sessions) > 0 {
-		return fmt.Errorf("container %s has active exec sessions, refusing to clean up: %w", c.ID(), define.ErrCtrStateInvalid)
-	}
-
-	defer c.newContainerEvent(events.Cleanup)
-	return c.cleanup(ctx)
+	return c.fullCleanup(ctx, onlyStopped)
 }
 
 // Batch starts a batch operation on the given container

libpod/container_graph.go

Lines changed: 193 additions & 47 deletions
@@ -4,14 +4,18 @@ package libpod
 
 import (
 	"context"
+	"errors"
 	"fmt"
 	"strings"
+	"sync"
 
 	"github.com/containers/podman/v5/libpod/define"
+	"github.com/containers/podman/v5/pkg/parallel"
 	"github.com/sirupsen/logrus"
 )
 
 type containerNode struct {
+	lock      sync.Mutex
 	id        string
 	container *Container
 	dependsOn []*containerNode
@@ -284,99 +288,241 @@ func startNode(ctx context.Context, node *containerNode, setError bool, ctrError
 	}
 }
 
-// Visit a node on the container graph and remove it, or set an error if it
-// failed to remove. Only intended for use in pod removal; do *not* use when
-// removing individual containers.
-// All containers are assumed to be *UNLOCKED* on running this function.
-// Container locks will be acquired as necessary.
-// Pod and infraID are optional. If a pod is given it must be *LOCKED*.
-func removeNode(ctx context.Context, node *containerNode, pod *Pod, force bool, timeout *uint, setError bool, ctrErrors map[string]error, ctrsVisited map[string]bool, ctrNamedVolumes map[string]*ContainerNamedVolume) {
+// Contains all details required for traversing the container graph.
+type nodeTraversal struct {
+	// Protects reads and writes to the two maps.
+	lock sync.Mutex
+	// Optional. but *MUST* be locked.
+	// Should NOT be changed once a traversal is started.
+	pod *Pod
+	// Function to execute on the individual container being acted on.
+	// Should NOT be changed once a traversal is started.
+	actionFunc func(ctr *Container, pod *Pod) error
+	// Shared list of errors for all containers currently acted on.
+	ctrErrors map[string]error
+	// Shared list of what containers have been visited.
+	ctrsVisited map[string]bool
+}
+
+// Perform a traversal of the graph in an inwards direction - meaning from nodes
+// with no dependencies, recursing inwards to the nodes they depend on.
+// Safe to run in parallel on multiple nodes.
+func traverseNodeInwards(node *containerNode, nodeDetails *nodeTraversal, setError bool) {
+	node.lock.Lock()
+
 	// If we already visited this node, we're done.
-	if ctrsVisited[node.id] {
+	nodeDetails.lock.Lock()
+	visited := nodeDetails.ctrsVisited[node.id]
+	nodeDetails.lock.Unlock()
+	if visited {
+		node.lock.Unlock()
 		return
 	}
 
 	// Someone who depends on us failed.
 	// Mark us as failed and recurse.
 	if setError {
-		ctrsVisited[node.id] = true
-		ctrErrors[node.id] = fmt.Errorf("a container that depends on container %s could not be removed: %w", node.id, define.ErrCtrStateInvalid)
+		nodeDetails.lock.Lock()
+		nodeDetails.ctrsVisited[node.id] = true
+		nodeDetails.ctrErrors[node.id] = fmt.Errorf("a container that depends on container %s could not be stopped: %w", node.id, define.ErrCtrStateInvalid)
+		nodeDetails.lock.Unlock()
+
+		node.lock.Unlock()
 
 		// Hit anyone who depends on us, set errors there as well.
 		for _, successor := range node.dependsOn {
-			removeNode(ctx, successor, pod, force, timeout, true, ctrErrors, ctrsVisited, ctrNamedVolumes)
+			traverseNodeInwards(successor, nodeDetails, true)
 		}
+
+		return
 	}
 
 	// Does anyone still depend on us?
-	// Cannot remove if true. Once all our dependencies have been removed,
-	// we will be removed.
+	// Cannot stop if true. Once all our dependencies have been stopped,
+	// we will be stopped.
 	for _, dep := range node.dependedOn {
 		// The container that depends on us hasn't been removed yet.
 		// OK to continue on
-		if ok := ctrsVisited[dep.id]; !ok {
+		nodeDetails.lock.Lock()
+		ok := nodeDetails.ctrsVisited[dep.id]
+		nodeDetails.lock.Unlock()
+		if !ok {
+			node.lock.Unlock()
 			return
 		}
 	}
 
-	// Going to try to remove the node, mark us as visited
-	ctrsVisited[node.id] = true
-
 	ctrErrored := false
+	if err := nodeDetails.actionFunc(node.container, nodeDetails.pod); err != nil {
+		ctrErrored = true
+		nodeDetails.lock.Lock()
+		nodeDetails.ctrErrors[node.id] = err
+		nodeDetails.lock.Unlock()
+	}
 
-	// Verify that all that depend on us are gone.
-	// Graph traversal should guarantee this is true, but this isn't that
-	// expensive, and it's better to be safe.
-	for _, dep := range node.dependedOn {
-		if _, err := node.container.runtime.GetContainer(dep.id); err == nil {
-			ctrErrored = true
-			ctrErrors[node.id] = fmt.Errorf("a container that depends on container %s still exists: %w", node.id, define.ErrDepExists)
-		}
+	// Mark as visited *only after* finished with operation.
+	// This ensures that the operation has completed, one way or the other.
+	// If an error was set, only do this after the viral ctrErrored
+	// propagates in traverseNodeInwards below.
+	// Same with the node lock - we don't want to release it until we are
+	// marked as visited.
+	if !ctrErrored {
+		nodeDetails.lock.Lock()
+		nodeDetails.ctrsVisited[node.id] = true
+		nodeDetails.lock.Unlock()
+
+		node.lock.Unlock()
 	}
 
-	// Lock the container
-	node.container.lock.Lock()
+	// Recurse to anyone who we depend on and work on them
+	for _, successor := range node.dependsOn {
+		traverseNodeInwards(successor, nodeDetails, ctrErrored)
+	}
 
-	// Gate all subsequent bits behind a ctrErrored check - we don't want to
-	// proceed if a previous step failed.
-	if !ctrErrored {
-		if err := node.container.syncContainer(); err != nil {
-			ctrErrored = true
-			ctrErrors[node.id] = err
+	// If we propagated an error, finally mark us as visited here, after
+	// all nodes we traverse to have already been marked failed.
+	// If we don't do this, there is a race condition where a node could try
+	// and perform its operation before it was marked failed by the
+	// traverseNodeInwards triggered by this process.
+	if ctrErrored {
+		nodeDetails.lock.Lock()
+		nodeDetails.ctrsVisited[node.id] = true
+		nodeDetails.lock.Unlock()
+
+		node.lock.Unlock()
+	}
+}
+
+// Stop all containers in the given graph, assumed to be a graph of pod.
+// Pod is mandatory and should be locked.
+func stopContainerGraph(ctx context.Context, graph *ContainerGraph, pod *Pod, timeout *uint, cleanup bool) (map[string]error, error) {
+	// Are there actually any containers in the graph?
+	// If not, return immediately.
+	if len(graph.nodes) == 0 {
+		return map[string]error{}, nil
+	}
+
+	nodeDetails := new(nodeTraversal)
+	nodeDetails.pod = pod
+	nodeDetails.ctrErrors = make(map[string]error)
+	nodeDetails.ctrsVisited = make(map[string]bool)
+
+	traversalFunc := func(ctr *Container, pod *Pod) error {
+		ctr.lock.Lock()
+		defer ctr.lock.Unlock()
+
+		if err := ctr.syncContainer(); err != nil {
+			return err
+		}
+
+		realTimeout := ctr.config.StopTimeout
+		if timeout != nil {
+			realTimeout = *timeout
 		}
+
+		if err := ctr.stop(realTimeout); err != nil && !errors.Is(err, define.ErrCtrStateInvalid) && !errors.Is(err, define.ErrCtrStopped) {
+			return err
+		}
+
+		if cleanup {
+			return ctr.fullCleanup(ctx, false)
+		}
+
+		return nil
 	}
+	nodeDetails.actionFunc = traversalFunc
 
-	if !ctrErrored {
-		for _, vol := range node.container.config.NamedVolumes {
+	doneChans := make([]<-chan error, 0, len(graph.notDependedOnNodes))
+
+	// Parallel enqueue jobs for all our starting nodes.
+	if len(graph.notDependedOnNodes) == 0 {
+		return nil, fmt.Errorf("no containers in pod %s are not dependencies of other containers, unable to stop", pod.ID())
+	}
+	for _, node := range graph.notDependedOnNodes {
+		doneChan := parallel.Enqueue(ctx, func() error {
+			traverseNodeInwards(node, nodeDetails, false)
+			return nil
+		})
+		doneChans = append(doneChans, doneChan)
+	}
+
+	// We don't care about the returns values, these functions always return nil
+	// But we do need all of the parallel jobs to terminate.
+	for _, doneChan := range doneChans {
+		<-doneChan
+	}
+
+	return nodeDetails.ctrErrors, nil
+}
+
+// Remove all containers in the given graph
+// Pod is optional, and must be locked if given.
+func removeContainerGraph(ctx context.Context, graph *ContainerGraph, pod *Pod, timeout *uint, force bool) (map[string]*ContainerNamedVolume, map[string]bool, map[string]error, error) {
+	// Are there actually any containers in the graph?
+	// If not, return immediately.
+	if len(graph.nodes) == 0 {
+		return nil, nil, nil, nil
+	}
+
+	nodeDetails := new(nodeTraversal)
+	nodeDetails.pod = pod
+	nodeDetails.ctrErrors = make(map[string]error)
+	nodeDetails.ctrsVisited = make(map[string]bool)
+
+	ctrNamedVolumes := make(map[string]*ContainerNamedVolume)
+
+	traversalFunc := func(ctr *Container, pod *Pod) error {
+		ctr.lock.Lock()
+		defer ctr.lock.Unlock()
+
+		if err := ctr.syncContainer(); err != nil {
+			return err
+		}
+
+		for _, vol := range ctr.config.NamedVolumes {
 			ctrNamedVolumes[vol.Name] = vol
 		}
 
-		if pod != nil && pod.state.InfraContainerID == node.id {
+		if pod != nil && pod.state.InfraContainerID == ctr.ID() {
 			pod.state.InfraContainerID = ""
 			if err := pod.save(); err != nil {
-				ctrErrored = true
-				ctrErrors[node.id] = fmt.Errorf("error removing infra container %s from pod %s: %w", node.id, pod.ID(), err)
+				return fmt.Errorf("error removing infra container %s from pod %s: %w", ctr.ID(), pod.ID(), err)
 			}
 		}
-	}
 
-	if !ctrErrored {
 		opts := ctrRmOpts{
 			Force:     force,
 			RemovePod: true,
 			Timeout:   timeout,
 		}
 
-		if _, _, err := node.container.runtime.removeContainer(ctx, node.container, opts); err != nil {
-			ctrErrored = true
-			ctrErrors[node.id] = err
+		if _, _, err := ctr.runtime.removeContainer(ctx, ctr, opts); err != nil {
+			return err
 		}
+
+		return nil
 	}
+	nodeDetails.actionFunc = traversalFunc
 
-	node.container.lock.Unlock()
+	doneChans := make([]<-chan error, 0, len(graph.notDependedOnNodes))
 
-	// Recurse to anyone who we depend on and remove them
-	for _, successor := range node.dependsOn {
-		removeNode(ctx, successor, pod, force, timeout, ctrErrored, ctrErrors, ctrsVisited, ctrNamedVolumes)
+	// Parallel enqueue jobs for all our starting nodes.
+	if len(graph.notDependedOnNodes) == 0 {
+		return nil, nil, nil, fmt.Errorf("no containers in graph are not dependencies of other containers, unable to stop")
 	}
+	for _, node := range graph.notDependedOnNodes {
+		doneChan := parallel.Enqueue(ctx, func() error {
+			traverseNodeInwards(node, nodeDetails, false)
+			return nil
+		})
+		doneChans = append(doneChans, doneChan)
+	}
+
+	// We don't care about the returns values, these functions always return nil
+	// But we do need all of the parallel jobs to terminate.
+	for _, doneChan := range doneChans {
+		<-doneChan
+	}
+
+	return ctrNamedVolumes, nodeDetails.ctrsVisited, nodeDetails.ctrErrors, nil
 }
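
The doneChans pattern used by both new graph functions is small but easy to get wrong, so here is a stripped-down, runnable illustration of it. The enqueue helper below is a stand-in for Podman's pkg/parallel.Enqueue (which additionally bounds how many jobs run at once), and the job names are made up; the only point demonstrated is queueing one job per starting node and draining every done channel before reading shared results.

package main

import (
	"context"
	"fmt"
)

// enqueue runs fn on its own goroutine and returns a channel that yields fn's
// result exactly once. A stand-in for parallel.Enqueue, without its job limit.
func enqueue(ctx context.Context, fn func() error) <-chan error {
	done := make(chan error, 1)
	go func() {
		defer close(done)
		select {
		case <-ctx.Done():
			done <- ctx.Err()
		default:
			done <- fn()
		}
	}()
	return done
}

func main() {
	ctx := context.Background()
	// Stand-ins for graph.notDependedOnNodes: the traversal starting points.
	starts := []string{"ctr-a", "ctr-b", "ctr-c"}

	doneChans := make([]<-chan error, 0, len(starts))
	for _, id := range starts {
		id := id
		doneChans = append(doneChans, enqueue(ctx, func() error {
			fmt.Println("traversing inwards from", id)
			return nil // real traversals record failures in the shared error map
		}))
	}

	// As in stopContainerGraph: the jobs always return nil, but every one must
	// terminate before the shared maps are read.
	for _, done := range doneChans {
		<-done
	}
	fmt.Println("all traversals finished")
}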
