Skip to content

Commit 566a57e

Browse files
authored
Adding a new e2e test for producing data loss (#2162)
* Adding a new e2e test for producing data loss
1 parent a1b7eba commit 566a57e

File tree

3 files changed

+248
-3
lines changed

3 files changed

+248
-3
lines changed

e2e/fixtures/fdb_cluster.go

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1840,9 +1840,14 @@ func (fdbCluster *FdbCluster) WriteKeyValue(
18401840
gomega.Expect(err).NotTo(gomega.HaveOccurred(), stderr)
18411841
}
18421842

1843-
// WriteKeyValues writes multiples key values into FDB.
1844-
func (fdbCluster *FdbCluster) WriteKeyValues(keyValues []KeyValue) {
1843+
// WriteKeyValuesWithTimeout writes multiples key values into FDB with the specified timeout.
1844+
func (fdbCluster *FdbCluster) WriteKeyValuesWithTimeout(keyValues []KeyValue, timeout int) {
18451845
for _, kv := range keyValues {
1846-
fdbCluster.WriteKeyValue(kv, 30)
1846+
fdbCluster.WriteKeyValue(kv, timeout)
18471847
}
18481848
}
1849+
1850+
// WriteKeyValues writes multiples key values into FDB.
1851+
func (fdbCluster *FdbCluster) WriteKeyValues(keyValues []KeyValue) {
1852+
fdbCluster.WriteKeyValuesWithTimeout(keyValues, 30)
1853+
}
Lines changed: 205 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,205 @@
1+
/*
2+
* operator_ha_failure_test.go
3+
*
4+
* This source file is part of the FoundationDB open source project
5+
*
6+
* Copyright 2018-2024 Apple Inc. and the FoundationDB project authors
7+
*
8+
* Licensed under the Apache License, Version 2.0 (the "License");
9+
* you may not use this file except in compliance with the License.
10+
* You may obtain a copy of the License at
11+
*
12+
* http://www.apache.org/licenses/LICENSE-2.0
13+
*
14+
* Unless required by applicable law or agreed to in writing, software
15+
* distributed under the License is distributed on an "AS IS" BASIS,
16+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17+
* See the License for the specific language governing permissions and
18+
* limitations under the License.
19+
*/
20+
21+
package operatorhafailure
22+
23+
/*
24+
This test suite contains destructive test cases for a multi-region FDB cluster (HA cluster).
25+
*/
26+
27+
import (
28+
"context"
29+
"fmt"
30+
"golang.org/x/sync/errgroup"
31+
"log"
32+
"time"
33+
34+
"github.com/FoundationDB/fdb-kubernetes-operator/e2e/fixtures"
35+
chaosmesh "github.com/chaos-mesh/chaos-mesh/api/v1alpha1"
36+
. "github.com/onsi/ginkgo/v2"
37+
. "github.com/onsi/gomega"
38+
)
39+
40+
// Suite-wide state shared by the init function, the BeforeSuite/AfterSuite hooks and the specs of this package.
var (
41+
// factory is created in BeforeSuite from testOptions and shut down in AfterSuite.
factory *fixtures.Factory
42+
// fdbCluster is the HA cluster created in BeforeSuite; the specs fetch its individual clusters from it.
fdbCluster *fixtures.HaFdbCluster
43+
// testOptions is set in init from fixtures.InitFlags and passed to fixtures.CreateFactory.
testOptions *fixtures.FactoryOptions
44+
)
45+
46+
// init stores the result of fixtures.InitFlags in testOptions so that BeforeSuite can build the factory from it.
// NOTE(review): presumably InitFlags registers the e2e command-line flags - confirm in the fixtures package.
func init() {
47+
testOptions = fixtures.InitFlags()
48+
}
49+
50+
var _ = BeforeSuite(func() {
51+
factory = fixtures.CreateFactory(testOptions)
52+
fdbCluster = factory.CreateFdbHaCluster(fixtures.DefaultClusterConfigWithHaMode(fixtures.HaFourZoneSingleSat, false), factory.GetClusterOptions()...)
53+
54+
// In order to test the robustness of the operator we try to kill the operator Pods every minute.
55+
if factory.ChaosTestsEnabled() {
56+
for _, cluster := range fdbCluster.GetAllClusters() {
57+
factory.ScheduleInjectPodKill(
58+
fixtures.GetOperatorSelector(cluster.Namespace()),
59+
"*/2 * * * *",
60+
chaosmesh.OneMode,
61+
)
62+
}
63+
}
64+
})
65+
66+
var _ = AfterSuite(func() {
67+
if CurrentSpecReport().Failed() {
68+
log.Printf("failed due to %s", CurrentSpecReport().FailureMessage())
69+
}
70+
factory.Shutdown()
71+
})
72+
73+
// This test suite contains destructive tests which will break the cluster after the test is done.
74+
// These tests are currently not run by our CI, as those tests might be flaky. The intention of these tests
75+
// is to simplify the manual testing of different scenarios that could lead to data loss.
76+
var _ = Describe("Operator HA Failure tests", Label("e2e"), func() {
77+
When("simulating data-loss during fail-over", func() {
78+
var experiments []*fixtures.ChaosMeshExperiment
79+
var keyValues []fixtures.KeyValue
80+
var prefix byte = 'a'
81+
82+
BeforeEach(func() {
83+
if factory.ChaosTestsEnabled() {
84+
Skip("chaos tests are required for this test suite")
85+
}
86+
// The idea of this test case is to partition the primary and primary satellite from the remote and remote
87+
// satellite. Then the cluster will be loaded with data, which will work because the primary and primary
88+
// satellite are available. Before deleting the partition we destroy the primary and the primary satellite
89+
// and then we forcefully recover the remote side, which will cause data loss, as the mutation are not synced
90+
// from the primary side to the remote (partitioned).
91+
primary := fdbCluster.GetPrimary()
92+
primarySatellite := fdbCluster.GetPrimarySatellite()
93+
remote := fdbCluster.GetRemote()
94+
remoteSatellite := fdbCluster.GetRemoteSatellite()
95+
experiments = make([]*fixtures.ChaosMeshExperiment, 0, 2)
96+
// Inject a partition between primary and the remote + remote satellite
97+
experiments = append(experiments, factory.InjectPartitionBetween(
98+
chaosmesh.PodSelectorSpec{
99+
GenericSelectorSpec: chaosmesh.GenericSelectorSpec{
100+
Namespaces: []string{primary.Namespace()},
101+
LabelSelectors: primary.GetCachedCluster().GetMatchLabels(),
102+
},
103+
},
104+
chaosmesh.PodSelectorSpec{
105+
GenericSelectorSpec: chaosmesh.GenericSelectorSpec{
106+
Namespaces: []string{
107+
remote.Namespace(),
108+
remoteSatellite.Namespace(),
109+
},
110+
},
111+
}))
112+
113+
// Inject a partition between primary satellite and the remote + remote satellite
114+
experiments = append(experiments, factory.InjectPartitionBetween(
115+
chaosmesh.PodSelectorSpec{
116+
GenericSelectorSpec: chaosmesh.GenericSelectorSpec{
117+
Namespaces: []string{primarySatellite.Namespace()},
118+
LabelSelectors: primarySatellite.GetCachedCluster().GetMatchLabels(),
119+
},
120+
},
121+
chaosmesh.PodSelectorSpec{
122+
GenericSelectorSpec: chaosmesh.GenericSelectorSpec{
123+
Namespaces: []string{
124+
remote.Namespace(),
125+
remoteSatellite.Namespace(),
126+
},
127+
},
128+
}))
129+
130+
time.Sleep(10 * time.Second)
131+
132+
keyValues = primary.GenerateRandomValues(10, prefix)
133+
primary.WriteKeyValuesWithTimeout(keyValues, 120)
134+
// Destroy primary and primary satellite (should have mutations that are not present in the remote side).
135+
primary.SetSkipReconciliation(true)
136+
primarySatellite.SetSkipReconciliation(true)
137+
// We also destroy the remote satellite, it shouldn't matter in this case as the remote satellite
138+
// has no data anyways. But the idea here is to reduce the possible interaction between the remote
139+
// and the remote satellite during the forced fail-over.
140+
remoteSatellite.SetSkipReconciliation(true)
141+
142+
// We could probably simulate that with the suspend command, but destroying the pods is a more robust solution.
143+
var wg errgroup.Group
144+
log.Println("Delete Pods in primary")
145+
wg.Go(func() error {
146+
for _, pod := range primary.GetPods().Items {
147+
factory.DeletePod(&pod)
148+
}
149+
150+
return nil
151+
})
152+
153+
log.Println("Delete Pods in primary satellite")
154+
wg.Go(func() error {
155+
for _, pod := range primarySatellite.GetPods().Items {
156+
factory.DeletePod(&pod)
157+
}
158+
159+
return nil
160+
})
161+
162+
log.Println("Delete Pods in remote satellite")
163+
wg.Go(func() error {
164+
for _, pod := range remoteSatellite.GetPods().Items {
165+
factory.DeletePod(&pod)
166+
}
167+
168+
return nil
169+
})
170+
171+
Expect(wg.Wait()).NotTo(HaveOccurred())
172+
// Wait a short amount of time to let the cluster see that the primary and primary satellite is down.
173+
time.Sleep(30 * time.Second)
174+
175+
// Ensure the cluster is unavailable.
176+
Eventually(func() bool {
177+
return remote.GetStatus().Client.DatabaseStatus.Available
178+
}).WithTimeout(2 * time.Minute).WithPolling(1 * time.Second).Should(BeFalse())
179+
})
180+
181+
AfterEach(func() {
182+
for _, experiment := range experiments {
183+
factory.DeleteChaosMeshExperimentSafe(experiment)
184+
}
185+
})
186+
187+
It("should fail-over and cause data loss", func() {
188+
remote := fdbCluster.GetRemote()
189+
// Pick one operator pod and execute the recovery command
190+
operatorPod := factory.RandomPickOnePod(factory.GetOperatorPods(remote.Namespace()).Items)
191+
log.Println("operatorPod:", operatorPod.Name)
192+
stdout, stderr, err := factory.ExecuteCmdOnPod(context.Background(), &operatorPod, "manager", fmt.Sprintf("kubectl-fdb -n %s recover-multi-region-cluster --version-check=false --wait=false %s", remote.Namespace(), remote.Name()), false)
193+
log.Println("stdout:", stdout, "stderr:", stderr)
194+
Expect(err).NotTo(HaveOccurred())
195+
196+
// Ensure the cluster is available again.
197+
Eventually(func() bool {
198+
return remote.GetStatus().Client.DatabaseStatus.Available
199+
}).WithTimeout(2 * time.Minute).WithPolling(1 * time.Second).Should(BeTrue())
200+
201+
// Ensure we lost some data.
202+
Expect(remote.GetRange([]byte{prefix}, 25, 60)).Should(BeEmpty())
203+
})
204+
})
205+
})
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
/*
2+
* suite_test.go
3+
*
4+
* This source file is part of the FoundationDB open source project
5+
*
6+
* Copyright 2018-2024 Apple Inc. and the FoundationDB project authors
7+
*
8+
* Licensed under the Apache License, Version 2.0 (the "License");
9+
* you may not use this file except in compliance with the License.
10+
* You may obtain a copy of the License at
11+
*
12+
* http://www.apache.org/licenses/LICENSE-2.0
13+
*
14+
* Unless required by applicable law or agreed to in writing, software
15+
* distributed under the License is distributed on an "AS IS" BASIS,
16+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17+
* See the License for the specific language governing permissions and
18+
* limitations under the License.
19+
*/
20+
21+
package operatorhafailure
22+
23+
import (
24+
"testing"
25+
"time"
26+
27+
"github.com/FoundationDB/fdb-kubernetes-operator/e2e/fixtures"
28+
"github.com/onsi/gomega"
29+
)
30+
31+
func TestOperatorHA(t *testing.T) {
32+
gomega.SetDefaultEventuallyTimeout(10 * time.Second)
33+
fixtures.SetTestSuiteName("operator-ha-failure")
34+
fixtures.RunGinkgoTests(t, "Operator HA failure test suite")
35+
}

0 commit comments

Comments
 (0)