@@ -30,6 +30,8 @@ import (
30
30
"log"
31
31
"time"
32
32
33
+ fdbv1beta2 "github.com/FoundationDB/fdb-kubernetes-operator/api/v1beta2"
34
+
33
35
"github.com/FoundationDB/fdb-kubernetes-operator/e2e/fixtures"
34
36
. "github.com/onsi/ginkgo/v2"
35
37
. "github.com/onsi/gomega"
@@ -95,6 +97,9 @@ var _ = Describe("Operator Plugin", Label("e2e", "pr"), func() {
95
97
remoteSatellite := fdbCluster .GetRemoteSatellite ()
96
98
remoteSatellite .SetSkipReconciliation (true )
97
99
100
+ remote := fdbCluster .GetRemote ()
101
+ remote .SetSkipReconciliation (true )
102
+
98
103
var wg errgroup.Group
99
104
log .Println ("Delete Pods in primary" )
100
105
wg .Go (func () error {
@@ -127,7 +132,6 @@ var _ = Describe("Operator Plugin", Label("e2e", "pr"), func() {
127
132
// Wait a short amount of time to let the cluster see that the primary and primary satellite is down.
128
133
time .Sleep (5 * time .Second )
129
134
130
- remote := fdbCluster .GetRemote ()
131
135
// Ensure the cluster is unavailable.
132
136
Eventually (func () bool {
133
137
return remote .GetStatus ().Client .DatabaseStatus .Available
@@ -157,4 +161,104 @@ var _ = Describe("Operator Plugin", Label("e2e", "pr"), func() {
157
161
}).WithTimeout (2 * time .Minute ).WithPolling (1 * time .Second ).Should (BeTrue ())
158
162
})
159
163
})
164
+
165
+ // TODO(johscheuer): Enable once https://github.com/FoundationDB/fdb-kubernetes-operator/issues/2153 is fixed.
166
+ PWhen ("all Pods in the primary and satellites are down with" , func () {
167
+ BeforeEach (func () {
168
+ runningVersion := fdbCluster .GetPrimary ().GetCluster ().GetRunningVersion ()
169
+ parsedVersion , err := fdbv1beta2 .ParseFdbVersion (runningVersion )
170
+ Expect (err ).NotTo (HaveOccurred ())
171
+
172
+ if ! parsedVersion .SupportsDNSInClusterFile () {
173
+ Skip (fmt .Sprintf ("Current FDB version: \" %s\" doesn't support DNS names in the cluster file" , runningVersion ))
174
+ }
175
+ })
176
+
177
+ When ("DNS names in the cluster file are supported" , func () {
178
+ BeforeEach (func () {
179
+ var errGroup errgroup.Group
180
+ // Enable DNS names in the cluster file for the whole cluster.
181
+ for _ , cluster := range fdbCluster .GetAllClusters () {
182
+ target := cluster
183
+ errGroup .Go (func () error {
184
+ return target .SetUseDNSInClusterFile (true )
185
+ })
186
+ }
187
+ Expect (errGroup .Wait ()).NotTo (HaveOccurred ())
188
+
189
+ // This tests is a destructive test where the cluster will stop working for some period.
190
+ primary := fdbCluster .GetPrimary ()
191
+ primary .SetSkipReconciliation (true )
192
+
193
+ primarySatellite := fdbCluster .GetPrimarySatellite ()
194
+ primarySatellite .SetSkipReconciliation (true )
195
+
196
+ remoteSatellite := fdbCluster .GetRemoteSatellite ()
197
+ remoteSatellite .SetSkipReconciliation (true )
198
+
199
+ remote := fdbCluster .GetRemote ()
200
+ remote .SetSkipReconciliation (true )
201
+
202
+ var wg errgroup.Group
203
+ log .Println ("Delete Pods in primary" )
204
+ wg .Go (func () error {
205
+ for _ , pod := range primary .GetPods ().Items {
206
+ factory .DeletePod (& pod )
207
+ }
208
+
209
+ return nil
210
+ })
211
+
212
+ log .Println ("Delete Pods in primary satellite" )
213
+ wg .Go (func () error {
214
+ for _ , pod := range primarySatellite .GetPods ().Items {
215
+ factory .DeletePod (& pod )
216
+ }
217
+
218
+ return nil
219
+ })
220
+
221
+ log .Println ("Delete Pods in remote satellite" )
222
+ wg .Go (func () error {
223
+ for _ , pod := range remoteSatellite .GetPods ().Items {
224
+ factory .DeletePod (& pod )
225
+ }
226
+
227
+ return nil
228
+ })
229
+
230
+ Expect (wg .Wait ()).NotTo (HaveOccurred ())
231
+ // Wait a short amount of time to let the cluster see that the primary and primary satellite is down.
232
+ time .Sleep (5 * time .Second )
233
+
234
+ // Ensure the cluster is unavailable.
235
+ Eventually (func () bool {
236
+ return remote .GetStatus ().Client .DatabaseStatus .Available
237
+ }).WithTimeout (2 * time .Minute ).WithPolling (1 * time .Second ).Should (BeFalse ())
238
+ })
239
+
240
+ AfterEach (func () {
241
+ log .Println ("Recreate cluster" )
242
+ // Delete the broken cluster.
243
+ fdbCluster .Delete ()
244
+ // Recreate the cluster to make sure the next tests can proceed
245
+ fdbCluster = factory .CreateFdbHaCluster (clusterConfig , clusterOptions ... )
246
+ })
247
+
248
+ It ("should recover the coordinators" , func () {
249
+ remote := fdbCluster .GetRemote ()
250
+ // Pick one operator pod and execute the recovery command
251
+ operatorPod := factory .RandomPickOnePod (factory .GetOperatorPods (remote .Namespace ()).Items )
252
+ log .Println ("operatorPod:" , operatorPod .Name )
253
+ stdout , stderr , err := factory .ExecuteCmdOnPod (context .Background (), & operatorPod , "manager" , fmt .Sprintf ("kubectl-fdb -n %s recover-multi-region-cluster --version-check=false --wait=false %s" , remote .Namespace (), remote .Name ()), false )
254
+ log .Println ("stdout:" , stdout , "stderr:" , stderr )
255
+ Expect (err ).NotTo (HaveOccurred ())
256
+
257
+ // Ensure the cluster is available again.
258
+ Eventually (func () bool {
259
+ return remote .GetStatus ().Client .DatabaseStatus .Available
260
+ }).WithTimeout (2 * time .Minute ).WithPolling (1 * time .Second ).Should (BeTrue ())
261
+ })
262
+ })
263
+ })
160
264
})
0 commit comments