Skip to content

Commit c4b8362

Browse files
Merge pull request #1102 from petr-muller/ocpbugs-22442-test-run-graph-mid-task-cancellation-flake-actual-fix
OCPBUGS-22442: Fix `TestRunGraph/mid-task_cancellation_with_work_in_queue_does_not_deadlock` flake
2 parents 90da0da + 70db253 commit c4b8362

File tree

2 files changed

+11
-1
lines changed

2 files changed

+11
-1
lines changed

pkg/payload/task_graph.go

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -478,6 +478,17 @@ func RunGraph(ctx context.Context, graph *TaskGraph, maxParallelism int, fn func
478478
defer utilruntime.HandleCrash()
479479
defer wg.Done()
480480
for {
481+
// First, make sure the worker was not signalled to cancel. This may seem redundant with the <-ctx.Done() below,
482+
// but it is necessary to properly handle the case where cancellation occurs while the worker is processing a
483+
// task. Go `select` is nondeterministic when multiple cases are ready, so when the worker finishes a task,
484+
// starts another loop and both the ctx.Done() and workCh cases are ready, Go could choose either of them,
485+
// potentially starting a new task even though the worker was supposed to stop. Checking cancellation here makes
486+
// the race window much smaller (cancellation would need to happen between this check and the select).
487+
if ctx.Err() != nil {
488+
klog.V(2).Infof("Worker %d: Received cancel signal", worker)
489+
return
490+
}
491+
481492
select {
482493
case <-ctx.Done():
483494
klog.V(2).Infof("Worker %d: Received cancel signal while waiting for work", worker)

pkg/payload/task_graph_test.go

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -898,7 +898,6 @@ func TestRunGraph(t *testing.T) {
898898
return err
899899
}
900900
cancelFn()
901-
// time.Sleep(time.Second)
902901
return nil
903902
},
904903
"*": func(t *testing.T, name string, ctx context.Context, cancelFn func()) error {

0 commit comments

Comments
 (0)