Commit 5886bb2

If the database is unavailable and caching is enabled, allow operator to proceed (#1932)
* If the database is unavailable and caching is enabled, we should allow the operator to proceed and simply try to refetch the machine-readable status later.
1 parent 255825b commit 5886bb2
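
The idea behind the change: when the operator cannot fetch a fresh machine-readable status but a cached copy exists, it proceeds with the cached copy and refetches on a later pass instead of aborting the reconciliation. Below is a minimal Go sketch of that behaviour; fetchStatusWithCache and the Status type are hypothetical stand-ins for illustration, not the operator's actual getStatusFromClusterOrDummyStatus code.

package main

import (
    "errors"
    "fmt"
    "time"
)

// Status stands in for FoundationDB's machine-readable status document.
type Status struct {
    Healthy bool
}

// fetchStatusWithCache is a hypothetical helper: it tries to fetch a fresh
// status and, if that fails while caching is enabled, falls back to the last
// cached copy instead of aborting the reconciliation attempt.
func fetchStatusWithCache(fetch func() (*Status, error), cached *Status, cacheEnabled bool) (*Status, error) {
    status, err := fetch()
    if err == nil {
        return status, nil
    }

    if cacheEnabled && cached != nil {
        // Proceed with the stale copy; a later reconciliation will refetch it.
        fmt.Println("could not fetch machine-readable status, using cached status and retrying later")
        return cached, nil
    }

    // Without a cache there is nothing to fall back to, so surface the error
    // and let the caller requeue (for example after 10 seconds).
    return nil, fmt.Errorf("status unavailable, requeue after %s: %w", 10*time.Second, err)
}

func main() {
    cached := &Status{Healthy: true}
    failingFetch := func() (*Status, error) { return nil, errors.New("database unavailable") }

    status, err := fetchStatusWithCache(failingFetch, cached, true)
    fmt.Println(status, err) // prints &{true} <nil>: the operator can keep reconciling
}

The cluster_controller.go diff below applies the same idea: on a fetch error the operator now only logs instead of returning an early requeue.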

5 files changed, +93 -10 lines

  controllers/cluster_controller.go
  controllers/replace_failed_process_groups.go
  controllers/update_lock_configuration.go
  controllers/update_status.go
  e2e/test_operator/operator_test.go

controllers/cluster_controller.go

Lines changed: 1 addition & 1 deletion

@@ -137,7 +137,7 @@ func (r *FoundationDBClusterReconciler) Reconcile(ctx context.Context, request c
 		clusterLog.Info("Fetch machine-readable status for reconcilitation loop", "cacheStatus", cacheStatus)
 		status, err = r.getStatusFromClusterOrDummyStatus(clusterLog, cluster)
 		if err != nil {
-			return ctrl.Result{Requeue: true, RequeueAfter: 10 * time.Second}, err
+			clusterLog.Info("could not fetch machine-readable status and therefore didn't cache the it")
 		}
 	}
 
controllers/replace_failed_process_groups.go

Lines changed: 2 additions & 2 deletions

@@ -48,13 +48,13 @@ func (c replaceFailedProcessGroups) reconcile(ctx context.Context, r *Foundation
 	if status == nil {
 		adminClient, err := r.DatabaseClientProvider.GetAdminClient(cluster, r)
 		if err != nil {
-			return &requeue{curError: err}
+			return &requeue{curError: err, delayedRequeue: true}
 		}
 		defer adminClient.Close()
 
 		status, err = adminClient.GetStatus()
 		if err != nil {
-			return &requeue{curError: err}
+			return &requeue{curError: err, delayedRequeue: true}
 		}
 	}
 

controllers/update_lock_configuration.go

Lines changed: 2 additions & 2 deletions

@@ -40,12 +40,12 @@ func (updateLockConfiguration) reconcile(_ context.Context, r *FoundationDBClust
 
 	lockClient, err := r.getLockClient(cluster)
 	if err != nil {
-		return &requeue{curError: err}
+		return &requeue{curError: err, delayedRequeue: true}
 	}
 
 	err = lockClient.UpdateDenyList(cluster.Spec.LockOptions.DenyList)
 	if err != nil {
-		return &requeue{curError: err}
+		return &requeue{curError: err, delayedRequeue: true}
 	}
 
 	return nil

controllers/update_status.go

Lines changed: 1 addition & 1 deletion

@@ -67,7 +67,7 @@ func (updateStatus) reconcile(ctx context.Context, r *FoundationDBClusterReconci
 		var err error
 		databaseStatus, err = r.getStatusFromClusterOrDummyStatus(logger, cluster)
 		if err != nil {
-			return &requeue{curError: fmt.Errorf("update_status error fetching status: %w", err)}
+			return &requeue{curError: fmt.Errorf("update_status error fetching status: %w", err), delayedRequeue: true}
 		}
 	}
 
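The controller changes above all switch from an immediate requeue to one with delayedRequeue: true. Below is a rough Go sketch of the intended effect, under the assumption that a delayed requeue lets the remaining subreconcilers run before the reconciliation is retried; the requeue struct and runSubReconcilers helper here are simplified stand-ins, not the operator's actual types.

package main

import (
    "errors"
    "fmt"
)

// requeue is a simplified stand-in for the operator's requeue result. The
// delayedRequeue flag here is an assumption about its intent: the error should
// not interrupt the current reconciliation, only trigger a retry afterwards.
type requeue struct {
    curError       error
    delayedRequeue bool
}

type subReconciler func() *requeue

// runSubReconcilers runs each step in order. An immediate requeue stops the
// loop right away; a delayed requeue is remembered and returned at the end,
// so the remaining steps still get a chance to run.
func runSubReconcilers(steps []subReconciler) *requeue {
    var delayed *requeue
    for _, step := range steps {
        if req := step(); req != nil {
            if !req.delayedRequeue {
                return req
            }
            if delayed == nil {
                delayed = req
            }
        }
    }
    return delayed
}

func main() {
    result := runSubReconcilers([]subReconciler{
        func() *requeue { return &requeue{curError: errors.New("status unavailable"), delayedRequeue: true} },
        func() *requeue { fmt.Println("later step still ran"); return nil },
    })
    fmt.Println(result.curError) // the error is reported only after all steps ran
}

Read this way, a temporarily unreachable database no longer blocks unrelated reconciliation steps; the error is still surfaced, just at the end of the pass.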

e2e/test_operator/operator_test.go

Lines changed: 87 additions & 4 deletions

@@ -49,9 +49,10 @@ import (
 )
 
 var (
-    factory     *fixtures.Factory
-    fdbCluster  *fixtures.FdbCluster
-    testOptions *fixtures.FactoryOptions
+    factory               *fixtures.Factory
+    fdbCluster            *fixtures.FdbCluster
+    testOptions           *fixtures.FactoryOptions
+    scheduleInjectPodKill *fixtures.ChaosMeshExperiment
 )
 
 func init() {

@@ -81,7 +82,7 @@ var _ = BeforeSuite(func() {
 
     // In order to test the robustness of the operator we try to kill the operator Pods every minute.
     if factory.ChaosTestsEnabled() {
-        factory.ScheduleInjectPodKill(
+        scheduleInjectPodKill = factory.ScheduleInjectPodKill(
            fixtures.GetOperatorSelector(fdbCluster.Namespace()),
            "*/2 * * * *",
            chaosmesh.OneMode,

@@ -1893,4 +1894,86 @@ var _ = Describe("Operator", Label("e2e", "pr"), func() {
             })
         })
     })
+
+    When("the cluster makes use of DNS in the cluster file", func() {
+        var initialSetting bool
+
+        BeforeEach(func() {
+            // Until the race condition is resolved in the FDB go bindings make sure the operator is not restarted.
+            factory.DeleteChaosMeshExperimentSafe(scheduleInjectPodKill)
+            cluster := fdbCluster.GetCluster()
+            parsedVersion, err := fdbv1beta2.ParseFdbVersion(cluster.Status.RunningVersion)
+            Expect(err).NotTo(HaveOccurred())
+
+            if !parsedVersion.SupportsDNSInClusterFile() {
+                Skip(fmt.Sprintf("current FoundationDB version %s doesn't support DNS", parsedVersion.String()))
+            }
+
+            initialSetting = cluster.UseDNSInClusterFile()
+            if !cluster.UseDNSInClusterFile() {
+                Expect(fdbCluster.SetUseDNSInClusterFile(true)).ToNot(HaveOccurred())
+            }
+        })
+
+        AfterEach(func() {
+            Expect(fdbCluster.SetUseDNSInClusterFile(initialSetting)).ToNot(HaveOccurred())
+
+            if factory.ChaosTestsEnabled() {
+                scheduleInjectPodKill = factory.ScheduleInjectPodKill(
+                    fixtures.GetOperatorSelector(fdbCluster.Namespace()),
+                    "*/2 * * * *",
+                    chaosmesh.OneMode,
+                )
+            }
+        })
+
+        When("all Pods are deleted", func() {
+            var initialPodsCnt int
+            var initialReplaceTime time.Duration
+
+            BeforeEach(func() {
+                availabilityCheck = false
+                initialReplaceTime = time.Duration(pointer.IntDeref(
+                    fdbCluster.GetClusterSpec().AutomationOptions.Replacements.FailureDetectionTimeSeconds,
+                    90,
+                )) * time.Second
+                Expect(fdbCluster.SetAutoReplacements(false, 30*time.Hour)).ShouldNot(HaveOccurred())
+                // Make sure the operator is not taking any action to prevent any race condition.
+                Expect(fdbCluster.SetSkipReconciliation(true)).NotTo(HaveOccurred())
+
+                // Delete all Pods
+                pods := fdbCluster.GetPods()
+                initialPodsCnt = len(pods.Items)
+                for _, pod := range pods.Items {
+                    podToDelete := &pod
+                    factory.Delete(podToDelete)
+                }
+
+                // Make sure the Pods are all deleted.
+                Eventually(func() []corev1.Pod {
+                    return fdbCluster.GetPods().Items
+                }).WithTimeout(5 * time.Minute).WithPolling(2 * time.Second).Should(BeEmpty())
+
+                // Enable the operator again
+                Expect(fdbCluster.SetSkipReconciliation(false)).NotTo(HaveOccurred())
+            })
+
+            It("should recreate all Pods and bring the cluster into a healthy state again", func() {
+                Eventually(func() int {
+                    return len(fdbCluster.GetPods().Items)
+                }).WithTimeout(5 * time.Minute).WithPolling(2 * time.Second).Should(BeNumerically(">=", initialPodsCnt))
+
+                Eventually(func() bool {
+                    return fdbCluster.GetStatus().Client.DatabaseStatus.Available
+                }).WithTimeout(5 * time.Minute).WithPolling(2 * time.Second).Should(BeTrue())
+
+                Expect(fdbCluster.WaitForReconciliation()).NotTo(HaveOccurred())
+            })
+
+            AfterEach(func() {
+                Expect(fdbCluster.SetSkipReconciliation(false)).NotTo(HaveOccurred())
+                Expect(fdbCluster.SetAutoReplacements(true, initialReplaceTime)).ShouldNot(HaveOccurred())
+            })
+        })
+    })
 })
