Skip to content

Commit 8244b52

Browse files
Merge pull request #1070 from Davoska/ocpbugs-32678-cvo-panics
OCPBUGS-32678: Fix a panic caused by a data race
2 parents df4cb12 + 1505f20 commit 8244b52

File tree

2 files changed

+79
-2
lines changed

2 files changed

+79
-2
lines changed

pkg/cvo/sync_worker.go

Lines changed: 27 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -161,6 +161,9 @@ type SyncWorker struct {
161161
// coordination between the sync loop and external callers
162162
notify chan string
163163
report chan SyncWorkerStatus
164+
// startApply is used to start the initial attempt to apply a payload. It may be
165+
// used consecutively to start additional attempts as well.
166+
startApply chan string
164167

165168
// lock guards changes to these fields
166169
lock sync.Mutex
@@ -199,7 +202,8 @@ func NewSyncWorker(retriever PayloadRetriever, builder payload.ResourceBuilder,
199202

200203
minimumReconcileInterval: reconcileInterval,
201204

202-
notify: make(chan string, 1),
205+
notify: make(chan string, 1),
206+
startApply: make(chan string, 1),
203207
// report is a large buffered channel to improve local testing - most consumers should invoke
204208
// Status() or use the result of calling Update() instead because the channel can be out of date
205209
// if the reader is not fast enough.
@@ -547,7 +551,7 @@ func (w *SyncWorker) Update(ctx context.Context, generation int64, desired confi
547551
}
548552
msg := "new work is available"
549553
select {
550-
case w.notify <- msg:
554+
case w.startApply <- msg:
551555
klog.V(2).Info("Notify the sync worker that new work is available")
552556
default:
553557
klog.V(2).Info("The sync worker has already been notified that new work is available")
@@ -565,6 +569,23 @@ func (w *SyncWorker) Start(ctx context.Context, maxWorkers int) {
565569
klog.V(2).Infof("Start: starting sync worker")
566570

567571
work := &SyncWork{}
572+
initialStartApplyReceived := make(chan string, 1) // a local channel to not cause a potential deadlock
573+
574+
// Until Update() has finished at least once, we do nothing.
575+
for loop := true; loop; {
576+
select {
577+
case <-ctx.Done():
578+
klog.V(2).Infof("The sync worker was shut down while waiting for the initial signal")
579+
return
580+
case <-w.notify:
581+
// Do not queue any retries until the worker has started
582+
klog.V(2).Infof("The sync worker was notified; however, it is waiting for the initial signal")
583+
case msg := <-w.startApply:
584+
klog.V(2).Infof("The sync worker has received the initial signal")
585+
initialStartApplyReceived <- msg
586+
loop = false
587+
}
588+
}
568589

569590
wait.Until(func() {
570591
consecutiveErrors := 0
@@ -582,6 +603,10 @@ func (w *SyncWorker) Start(ctx context.Context, maxWorkers int) {
582603
klog.V(2).Infof("Wait finished")
583604
case msg := <-w.notify:
584605
klog.V(2).Info(msg)
606+
case msg := <-w.startApply:
607+
klog.V(2).Info(msg)
608+
case msg := <-initialStartApplyReceived:
609+
klog.V(2).Info(msg)
585610
}
586611

587612
// determine whether we need to do work

pkg/cvo/sync_worker_test.go

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -476,3 +476,55 @@ func Test_equalDigest(t *testing.T) {
476476
})
477477
}
478478
}
479+
480+
func Test_SyncWorkerShouldNotPanicDueToNotifySignalAtStartUp(t *testing.T) {
481+
o, _, _, _, shutdownFn := setupCVOTest("testdata/panic")
482+
defer shutdownFn()
483+
syncChannel := make(chan bool)
484+
485+
// Start() should not cause a panic when the notify channel already contains elements at its startup
486+
ctx, cancel := context.WithCancel(context.Background())
487+
worker := o.configSync.(*SyncWorker)
488+
worker.notify <- "Notify the sync worker: Cluster operator A changed versions"
489+
go func() {
490+
worker.Start(ctx, 1)
491+
syncChannel <- true
492+
}()
493+
494+
// A panic must not occur due to the notify signal; wait a reasonable time for a potential panic
495+
<-time.After(3 * time.Second)
496+
497+
// Shut down the sync worker and wait for the confirmation
498+
cancel()
499+
select {
500+
case <-syncChannel:
501+
case <-time.After(3 * time.Second):
502+
t.Fatal("Sync worker did not shut down in time after its context was cancelled")
503+
}
504+
}
505+
506+
func Test_SyncWorkerShouldShutDownImmediatelyAtStartUpWhenContextCancelled(t *testing.T) {
507+
o, _, _, _, shutdownFn := setupCVOTest("testdata/panic")
508+
defer shutdownFn()
509+
syncChannel := make(chan bool)
510+
511+
// Start() shuts down immediately while waiting for its initial signal
512+
// Only the context cancellation signal is received
513+
ctx, cancel := context.WithCancel(context.Background())
514+
worker := o.configSync.(*SyncWorker)
515+
go func() {
516+
worker.Start(ctx, 1)
517+
syncChannel <- true
518+
}()
519+
520+
// A panic must not occur due to the notify signal; wait a reasonable time for a potential panic
521+
<-time.After(3 * time.Second)
522+
523+
// Shut down the sync worker and wait for the confirmation
524+
cancel()
525+
select {
526+
case <-syncChannel:
527+
case <-time.After(3 * time.Second):
528+
t.Fatal("Sync worker did not shut down in time after its context was cancelled")
529+
}
530+
}

0 commit comments

Comments
 (0)