Skip to content

Commit 1505f20

Browse files
committed
pkg/cvo/sync_worker.go: Fix panic caused by data race
The bug is explained in more detail in the commit that is responsible for testing the bug's behavior. For more information, see the commit 7f9cf62 This commit will fix the issue by introducing a new channel to exclusively signal the start of the initial attempt of the application after the `SyncWorker.Update()` has finished. It may also be used for any consecutive signals to attempt an application. This ensures the `Update()` has finished at least once before we attempt to apply the payload. A new local channel is added to the `Start()` method to not cause a potential deadlock. The `Start()` method is the sole consumer of the `notify` and `startApply` channels. In a potential scenario, the `Start()` goroutine may consume an element from one of the channels and get starved. The goroutine would not be able to produce the element back into the channel and break out of the loop, because the channel could already be full. This would create a deadlock as there wouldn't be any consumers of the channels. An alternative is to utilize other Go structs; however, using channels requires minimal changes to the present code.
1 parent 7f9cf62 commit 1505f20

File tree

1 file changed

+27
-2
lines changed

1 file changed

+27
-2
lines changed

pkg/cvo/sync_worker.go

Lines changed: 27 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -161,6 +161,9 @@ type SyncWorker struct {
161161
// coordination between the sync loop and external callers
162162
notify chan string
163163
report chan SyncWorkerStatus
164+
// startApply is used to start the initial attempt to apply a payload. It may be
165+
// used consecutively to start additional attempts as well.
166+
startApply chan string
164167

165168
// lock guards changes to these fields
166169
lock sync.Mutex
@@ -199,7 +202,8 @@ func NewSyncWorker(retriever PayloadRetriever, builder payload.ResourceBuilder,
199202

200203
minimumReconcileInterval: reconcileInterval,
201204

202-
notify: make(chan string, 1),
205+
notify: make(chan string, 1),
206+
startApply: make(chan string, 1),
203207
// report is a large buffered channel to improve local testing - most consumers should invoke
204208
// Status() or use the result of calling Update() instead because the channel can be out of date
205209
// if the reader is not fast enough.
@@ -547,7 +551,7 @@ func (w *SyncWorker) Update(ctx context.Context, generation int64, desired confi
547551
}
548552
msg := "new work is available"
549553
select {
550-
case w.notify <- msg:
554+
case w.startApply <- msg:
551555
klog.V(2).Info("Notify the sync worker that new work is available")
552556
default:
553557
klog.V(2).Info("The sync worker has already been notified that new work is available")
@@ -565,6 +569,23 @@ func (w *SyncWorker) Start(ctx context.Context, maxWorkers int) {
565569
klog.V(2).Infof("Start: starting sync worker")
566570

567571
work := &SyncWork{}
572+
initialStartApplyReceived := make(chan string, 1) // a local channel to not cause a potential deadlock
573+
574+
// Until Update() has finished at least once, we do nothing.
575+
for loop := true; loop; {
576+
select {
577+
case <-ctx.Done():
578+
klog.V(2).Infof("The sync worker was shut down while waiting for the initial signal")
579+
return
580+
case <-w.notify:
581+
// Do not queue any retries until the worker has started
582+
klog.V(2).Infof("The sync worker was notified; however, it is waiting for the initial signal")
583+
case msg := <-w.startApply:
584+
klog.V(2).Infof("The sync worker has received the initial signal")
585+
initialStartApplyReceived <- msg
586+
loop = false
587+
}
588+
}
568589

569590
wait.Until(func() {
570591
consecutiveErrors := 0
@@ -582,6 +603,10 @@ func (w *SyncWorker) Start(ctx context.Context, maxWorkers int) {
582603
klog.V(2).Infof("Wait finished")
583604
case msg := <-w.notify:
584605
klog.V(2).Info(msg)
606+
case msg := <-w.startApply:
607+
klog.V(2).Info(msg)
608+
case msg := <-initialStartApplyReceived:
609+
klog.V(2).Info(msg)
585610
}
586611

587612
// determine whether we need to do work

0 commit comments

Comments
 (0)