pkg/cvo/sync_worker_test.go: Add test for panic caused by goroutine synchronization

DavidHurta · DavidHurta · commit 7f9cf621ce33 · 2024-07-15T14:24:36.000+02:00
After the CVO has started and its worker goroutines are starting to work, a panic may occur due to a data race. Originally (at 961873d), the `SyncWorker.Start()` method was exclusively waiting for a signal on the `SyncWorker.notify` channel from the `SyncWorker.Update()` method before attempting the first application of a payload. By the time the `Start()` method got the signal, the values it requires, such as the `SyncWorker.work` field, had already been initialized by the `SyncWorker.Update()` method. However, commit 9222fa9 introduced another section of the CVO that can signal the `SyncWorker.notify` channel. The `SyncWorker.NotifyAboutManagedResourceActivity()` method now signals the channel upon an activity on a managed resource. This signal may occur before the `SyncWorker.Update()` method has finished. The signal thus triggers an attempt by the `SyncWorker.Start()` method to apply a payload before the `SyncWorker.work` field has been set by the `SyncWorker.Update()` method. This currently results in an "invalid memory address or nil pointer dereference" panic in the `SyncWork.calculateNextFrom()` method because a nil pointer is passed by the `SyncWorker.Start()` method. This panic can be seen in the following logs [1, 2]. Be sure to notice messages such as `Notify the sync worker: Cluster operator etcd changed Degraded from "False" to "True"` caused by the `NotifyAboutManagedResourceActivity` method right before a panic occurs. This commit adds a test to check whether a panic occurs when the notify channel already contains an element at the startup of the Start() method. The desired behavior for the Start() method is not to cause a panic when a notify signal is received before the Update() method has had time to finish. The cancellation of the Start() method's context must remain functional; this commit tests this as well. It is possible that a nonempty notify channel at the Start() method's startup will not have time to cause a panic in the implemented test.
However, such a case is improbable due to the simplicity of the underlying code, and a three-second timeout should be enough time to cause such a panic. The alternative is to implement a fine-grained status of the sync worker, which would add additional code and complexity. [1] https://gcsweb-ci.apps.ci.l2s4.p1.openshiftapps.com/gcs/test-platform-results/logs/periodic-ci-openshift-release-master-nightly-4.11-e2e-aws-workers-rhel8/1804819848023248896/artifacts/e2e-aws-workers-rhel8/gather-extra/artifacts/pods/openshift-cluster-version_cluster-version-operator-545d8cb5db-v2bf4_cluster-version-operator_previous.log [2] https://gcsweb-ci.apps.ci.l2s4.p1.openshiftapps.com/gcs/test-platform-results/logs/periodic-ci-openshift-release-master-nightly-4.11-e2e-ovirt/1805174927066664960/artifacts/e2e-ovirt/gather-extra/artifacts/pods/openshift-cluster-version_cluster-version-operator-849c49896b-wnn8v_cluster-version-operator_previous.log
diff --git a/pkg/cvo/sync_worker_test.go b/pkg/cvo/sync_worker_test.go
@@ -476,3 +476,55 @@ func Test_equalDigest(t *testing.T) {
 		})
 	}
 }
+
+// Test_SyncWorkerShouldNotPanicDueToNotifySignalAtStartUp checks that
+// SyncWorker.Start() does not panic when the notify channel already holds a
+// message at its startup, and that cancelling its context still makes it return.
+func Test_SyncWorkerShouldNotPanicDueToNotifySignalAtStartUp(t *testing.T) {
+	o, _, _, _, shutdownFn := setupCVOTest("testdata/panic")
+	defer shutdownFn()
+
+	// Buffered so the goroutine below can always deliver its result and exit,
+	// even when the test has already given up waiting for it.
+	syncChannel := make(chan bool, 1)
+
+	// Fail with a clear message on an unexpected type instead of panicking,
+	// so a setup problem is not mistaken for the panic under test.
+	worker, ok := o.configSync.(*SyncWorker)
+	if !ok {
+		t.Fatalf("expected configSync to be a *SyncWorker, got %T", o.configSync)
+	}
+
+	// Start() should not cause a panic when the notify channel already contains elements at its startup
+	// NOTE(review): this send blocks unless notify has free buffer capacity — confirm against the SyncWorker setup
+	ctx, cancel := context.WithCancel(context.Background())
+	worker.notify <- "Notify the sync worker: Cluster operator A changed versions"
+	go func() {
+		worker.Start(ctx, 1)
+		syncChannel <- true
+	}()
+
+	// A panic must not occur due to the notify signal; wait a reasonable time for a potential panic
+	time.Sleep(3 * time.Second)
+
+	// Shut down the sync worker and wait for the confirmation
+	cancel()
+	select {
+	case <-syncChannel:
+	case <-time.After(3 * time.Second):
+		t.Fatal("Sync worker did not shut down in time after its context was cancelled")
+	}
+}
+
+// Test_SyncWorkerShouldShutDownImmediatelyAtStartUpWhenContextCancelled checks
+// that SyncWorker.Start() returns once its context is cancelled while it is
+// still waiting for its initial signal; this test itself sends nothing on the
+// notify channel.
+func Test_SyncWorkerShouldShutDownImmediatelyAtStartUpWhenContextCancelled(t *testing.T) {
+	o, _, _, _, shutdownFn := setupCVOTest("testdata/panic")
+	defer shutdownFn()
+
+	// Buffered so the goroutine below can always deliver its result and exit,
+	// even when the test has already given up waiting for it.
+	syncChannel := make(chan bool, 1)
+
+	// Fail with a clear message on an unexpected type instead of panicking.
+	worker, ok := o.configSync.(*SyncWorker)
+	if !ok {
+		t.Fatalf("expected configSync to be a *SyncWorker, got %T", o.configSync)
+	}
+
+	// Start() shuts down immediately while waiting for its initial signal
+	// Only the context cancellation signal is received
+	ctx, cancel := context.WithCancel(context.Background())
+	go func() {
+		worker.Start(ctx, 1)
+		syncChannel <- true
+	}()
+
+	// Give Start() a reasonable time to begin waiting for its initial signal before cancelling
+	time.Sleep(3 * time.Second)
+
+	// Shut down the sync worker and wait for the confirmation
+	cancel()
+	select {
+	case <-syncChannel:
+	case <-time.After(3 * time.Second):
+		t.Fatal("Sync worker did not shut down in time after its context was cancelled")
+	}
+}