Skip to content

Commit e426237

Browse files
authored
Make sure that the remote cluster is skipped too and add more debugging output for the plugin (#2154)
1 parent f2bdfb9 commit e426237

File tree

2 files changed

+107
-1
lines changed

2 files changed

+107
-1
lines changed

e2e/test_operator_plugin/operator_plugin_test.go

Lines changed: 105 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,8 @@ import (
3030
"log"
3131
"time"
3232

33+
fdbv1beta2 "github.com/FoundationDB/fdb-kubernetes-operator/api/v1beta2"
34+
3335
"github.com/FoundationDB/fdb-kubernetes-operator/e2e/fixtures"
3436
. "github.com/onsi/ginkgo/v2"
3537
. "github.com/onsi/gomega"
@@ -95,6 +97,9 @@ var _ = Describe("Operator Plugin", Label("e2e", "pr"), func() {
9597
remoteSatellite := fdbCluster.GetRemoteSatellite()
9698
remoteSatellite.SetSkipReconciliation(true)
9799

100+
remote := fdbCluster.GetRemote()
101+
remote.SetSkipReconciliation(true)
102+
98103
var wg errgroup.Group
99104
log.Println("Delete Pods in primary")
100105
wg.Go(func() error {
@@ -127,7 +132,6 @@ var _ = Describe("Operator Plugin", Label("e2e", "pr"), func() {
127132
// Wait a short amount of time to let the cluster see that the primary and primary satellite are down.
128133
time.Sleep(5 * time.Second)
129134

130-
remote := fdbCluster.GetRemote()
131135
// Ensure the cluster is unavailable.
132136
Eventually(func() bool {
133137
return remote.GetStatus().Client.DatabaseStatus.Available
@@ -157,4 +161,104 @@ var _ = Describe("Operator Plugin", Label("e2e", "pr"), func() {
157161
}).WithTimeout(2 * time.Minute).WithPolling(1 * time.Second).Should(BeTrue())
158162
})
159163
})
164+
165+
// TODO(johscheuer): Enable once https://github.com/FoundationDB/fdb-kubernetes-operator/issues/2153 is fixed.
166+
PWhen("all Pods in the primary and satellites are down with", func() {
167+
BeforeEach(func() {
168+
runningVersion := fdbCluster.GetPrimary().GetCluster().GetRunningVersion()
169+
parsedVersion, err := fdbv1beta2.ParseFdbVersion(runningVersion)
170+
Expect(err).NotTo(HaveOccurred())
171+
172+
if !parsedVersion.SupportsDNSInClusterFile() {
173+
Skip(fmt.Sprintf("Current FDB version: \"%s\" doesn't support DNS names in the cluster file", runningVersion))
174+
}
175+
})
176+
177+
When("DNS names in the cluster file are supported", func() {
178+
BeforeEach(func() {
179+
var errGroup errgroup.Group
180+
// Enable DNS names in the cluster file for the whole cluster.
181+
for _, cluster := range fdbCluster.GetAllClusters() {
182+
target := cluster
183+
errGroup.Go(func() error {
184+
return target.SetUseDNSInClusterFile(true)
185+
})
186+
}
187+
Expect(errGroup.Wait()).NotTo(HaveOccurred())
188+
189+
// This test is destructive: the cluster will stop working for some period.
190+
primary := fdbCluster.GetPrimary()
191+
primary.SetSkipReconciliation(true)
192+
193+
primarySatellite := fdbCluster.GetPrimarySatellite()
194+
primarySatellite.SetSkipReconciliation(true)
195+
196+
remoteSatellite := fdbCluster.GetRemoteSatellite()
197+
remoteSatellite.SetSkipReconciliation(true)
198+
199+
remote := fdbCluster.GetRemote()
200+
remote.SetSkipReconciliation(true)
201+
202+
var wg errgroup.Group
203+
log.Println("Delete Pods in primary")
204+
wg.Go(func() error {
205+
for _, pod := range primary.GetPods().Items {
206+
factory.DeletePod(&pod)
207+
}
208+
209+
return nil
210+
})
211+
212+
log.Println("Delete Pods in primary satellite")
213+
wg.Go(func() error {
214+
for _, pod := range primarySatellite.GetPods().Items {
215+
factory.DeletePod(&pod)
216+
}
217+
218+
return nil
219+
})
220+
221+
log.Println("Delete Pods in remote satellite")
222+
wg.Go(func() error {
223+
for _, pod := range remoteSatellite.GetPods().Items {
224+
factory.DeletePod(&pod)
225+
}
226+
227+
return nil
228+
})
229+
230+
Expect(wg.Wait()).NotTo(HaveOccurred())
231+
// Wait a short amount of time to let the cluster see that the primary, primary satellite and remote satellite are down.
232+
time.Sleep(5 * time.Second)
233+
234+
// Ensure the cluster is unavailable.
235+
Eventually(func() bool {
236+
return remote.GetStatus().Client.DatabaseStatus.Available
237+
}).WithTimeout(2 * time.Minute).WithPolling(1 * time.Second).Should(BeFalse())
238+
})
239+
240+
AfterEach(func() {
241+
log.Println("Recreate cluster")
242+
// Delete the broken cluster.
243+
fdbCluster.Delete()
244+
// Recreate the cluster to make sure the next tests can proceed
245+
fdbCluster = factory.CreateFdbHaCluster(clusterConfig, clusterOptions...)
246+
})
247+
248+
It("should recover the coordinators", func() {
249+
remote := fdbCluster.GetRemote()
250+
// Pick one operator pod and execute the recovery command
251+
operatorPod := factory.RandomPickOnePod(factory.GetOperatorPods(remote.Namespace()).Items)
252+
log.Println("operatorPod:", operatorPod.Name)
253+
stdout, stderr, err := factory.ExecuteCmdOnPod(context.Background(), &operatorPod, "manager", fmt.Sprintf("kubectl-fdb -n %s recover-multi-region-cluster --version-check=false --wait=false %s", remote.Namespace(), remote.Name()), false)
254+
log.Println("stdout:", stdout, "stderr:", stderr)
255+
Expect(err).NotTo(HaveOccurred())
256+
257+
// Ensure the cluster is available again.
258+
Eventually(func() bool {
259+
return remote.GetStatus().Client.DatabaseStatus.Available
260+
}).WithTimeout(2 * time.Minute).WithPolling(1 * time.Second).Should(BeTrue())
261+
})
262+
})
263+
})
160264
})

kubectl-fdb/cmd/recover_multi_region_cluster.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -182,6 +182,8 @@ func recoverMultiRegionCluster(cmd *cobra.Command, opts recoverMultiRegionCluste
182182
return parseErr
183183
}
184184

185+
cmd.Println("checking pod:", pod.Name, "address:", addr, "pod IPs:", pod.Status.PodIP, "machineAddr:", addr.MachineAddress())
186+
185187
loopPod := pod
186188
if coordinatorAddr, ok := coordinators[addr.MachineAddress()]; ok {
187189
cmd.Println("Found coordinator for cluster", pod.Name, "address", addr.MachineAddress())

0 commit comments

Comments
 (0)