Skip to content

Commit 4653430

Browse files
authored
Fix the e2e test case for the new FDB version that supports to remove old tester processes automatically (#1955)
1 parent f150c0a commit 4653430

File tree

5 files changed

+88
-29
lines changed

5 files changed

+88
-29
lines changed

api/v1beta2/foundationdb_version.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -232,6 +232,16 @@ func (version Version) SupportsLocalityBasedExclusions() bool {
232232
return version.IsAtLeast(Versions.SupportsLocalityBasedExclusions)
233233
}
234234

235+
// AutomaticallyRemovesDeadTesterProcesses returns true if the FDB version automatically removes old tester processes
236+
// from the list of processes.
237+
func (version Version) AutomaticallyRemovesDeadTesterProcesses() bool {
238+
if version.IsProtocolCompatible(Version{Major: 7, Minor: 3, Patch: 0}) {
239+
return version.IsAtLeast(Version{Major: 7, Minor: 1, Patch: 35})
240+
}
241+
242+
return version.IsAtLeast(Version{Major: 7, Minor: 1, Patch: 55})
243+
}
244+
235245
// Versions provides a shorthand for known versions.
236246
// This is only to be used in testing.
237247
var Versions = struct {

controllers/bounce_processes.go

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,7 @@ func (bounceProcesses) reconcile(_ context.Context, r *FoundationDBClusterReconc
7878
// Check if the status contains unreachable tester processes. In this case the cluster controller must be restarted.
7979
// Otherwise the status will contain a message with "status_incomplete" and "unreachable_processes". Those messages
8080
// could block further actions like the check if a process is excluded and doesn't serve any roles.
81-
clusterControllerAddress := checkIfClusterControllerNeedsRestart(status)
81+
clusterControllerAddress := checkIfClusterControllerNeedsRestart(logger, cluster, status)
8282
if clusterControllerAddress != nil {
8383
logger.Info("found unreachable tester processes in status which requires a cluster controller restart")
8484
// Adding the same address twice is not a problem for the kill command, so we can just append the returned address.
@@ -351,8 +351,20 @@ func getAddressesForUpgrade(logger logr.Logger, r *FoundationDBClusterReconciler
351351
// this might be required. One case is when at least one tester process is running in the cluster and that tester process
352352
// fails. Currently this leads to the cluster controller reporting unreachable processes and the status incomplete message.
353353
// Having those messages in the cluster's machine-readable status could block some operations of the operator and the
354-
// solution to that is to restart the cluster controller process.
355-
func checkIfClusterControllerNeedsRestart(status *fdbv1beta2.FoundationDBStatus) *fdbv1beta2.ProcessAddress {
354+
// solution to that is to restart the cluster controller process. If the FDB version supports removing the old tester
355+
// worker automatically this step will return no processes to be restarted.
356+
func checkIfClusterControllerNeedsRestart(logger logr.Logger, cluster *fdbv1beta2.FoundationDBCluster, status *fdbv1beta2.FoundationDBStatus) *fdbv1beta2.ProcessAddress {
357+
runningVersion, err := fdbv1beta2.ParseFdbVersion(cluster.GetRunningVersion())
358+
if err != nil {
359+
logger.Error(err, "could not parse running version in checkIfClusterControllerNeedsRestart")
360+
return nil
361+
}
362+
363+
// If the cluster controller automatically removes the dead tester processes, the operator can skip any further work.
364+
if runningVersion.AutomaticallyRemovesDeadTesterProcesses() {
365+
return nil
366+
}
367+
356368
// If the status contains no cluster messages we can skip further check.
357369
if len(status.Cluster.Messages) == 0 {
358370
return nil

controllers/bounce_processes_test.go

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -723,6 +723,30 @@ var _ = Describe("bounceProcesses", func() {
723723
Expect(adminClient.KilledAddresses).To(BeEmpty())
724724
})
725725
})
726+
727+
When("the FDB version automatically removes old tester processes", func() {
728+
var previousVersion string
729+
730+
BeforeEach(func() {
731+
previousVersion = cluster.Status.RunningVersion
732+
cluster.Spec.Version = "7.1.57"
733+
cluster.Status.RunningVersion = "7.1.57"
734+
})
735+
736+
AfterEach(func() {
737+
cluster.Status.RunningVersion = previousVersion
738+
cluster.Spec.Version = previousVersion
739+
})
740+
741+
It("should not requeue", func() {
742+
Expect(err).NotTo(HaveOccurred())
743+
Expect(requeue).To(BeNil())
744+
})
745+
746+
It("should not kill the cluster controller", func() {
747+
Expect(adminClient.KilledAddresses).To(BeEmpty())
748+
})
749+
})
726750
})
727751

728752
When("the unreachable processes include no tester processes", func() {
@@ -774,7 +798,7 @@ var _ = Describe("bounceProcesses", func() {
774798
})
775799
})
776800

777-
When("the cluster message doesn not contain unreachable processes", func() {
801+
When("the cluster message does not contain unreachable processes", func() {
778802
BeforeEach(func() {
779803
adminClient, err = mock.NewMockAdminClientUncast(cluster, k8sClient)
780804
Expect(err).NotTo(HaveOccurred())

e2e/Makefile

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,9 @@ TIMEOUT?=168h
1010
CLUSTER_NAME?=
1111
NAMESPACE?=
1212
CONTEXT?=
13-
FDB_VERSION?=7.1.53
13+
FDB_VERSION?=7.1.57
1414
# This will be the version used for upgrade tests.
15-
NEXT_FDB_VERSION?=7.3.29
15+
NEXT_FDB_VERSION?=7.3.33
1616
## Expectation is that you are running standard build image which generates both regular and debug (Symbols) images.
1717
FDB_IMAGE?=foundationdb/foundationdb:$(FDB_VERSION)
1818
SIDECAR_IMAGE?=foundationdb/foundationdb-kubernetes-sidecar:$(FDB_VERSION)-1

e2e/test_operator/operator_test.go

Lines changed: 36 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1750,7 +1750,17 @@ var _ = Describe("Operator", Label("e2e", "pr"), func() {
17501750
})
17511751

17521752
When("running with tester processes", func() {
1753+
fdbAutomaticallyRemoveOldTester := false
1754+
17531755
BeforeEach(func() {
1756+
fdbVersion := fdbCluster.GetCluster().GetRunningVersion()
1757+
version, err := fdbv1beta2.ParseFdbVersion(fdbVersion)
1758+
Expect(err).NotTo(HaveOccurred())
1759+
1760+
if version.AutomaticallyRemovesDeadTesterProcesses() {
1761+
fdbAutomaticallyRemoveOldTester = true
1762+
}
1763+
17541764
// We will be restarting the CC, so we can ignore this check.
17551765
availabilityCheck = false
17561766
spec := fdbCluster.GetCluster().Spec.DeepCopy()
@@ -1789,25 +1799,27 @@ var _ = Describe("Operator", Label("e2e", "pr"), func() {
17891799
factory.Delete(&pod)
17901800
}
17911801

1792-
// Wait until the cluster shows the unreachable process.
1793-
Eventually(func() []string {
1794-
status := fdbCluster.GetStatus()
1802+
if !fdbAutomaticallyRemoveOldTester {
1803+
// Wait until the cluster shows the unreachable process.
1804+
Eventually(func() []string {
1805+
status := fdbCluster.GetStatus()
17951806

1796-
messages := make([]string, 0, len(status.Cluster.Messages))
1797-
for _, message := range status.Cluster.Messages {
1798-
messages = append(messages, message.Name)
1799-
}
1807+
messages := make([]string, 0, len(status.Cluster.Messages))
1808+
for _, message := range status.Cluster.Messages {
1809+
messages = append(messages, message.Name)
1810+
}
18001811

1801-
log.Println("current messages:", messages)
1812+
log.Println("current messages:", messages)
18021813

1803-
return messages
1804-
}).WithPolling(1 * time.Second).WithTimeout(2 * time.Minute).MustPassRepeatedly(5).Should(ContainElements("status_incomplete", "unreachable_processes"))
1814+
return messages
1815+
}).WithPolling(1 * time.Second).WithTimeout(2 * time.Minute).MustPassRepeatedly(5).Should(ContainElements("status_incomplete", "unreachable_processes"))
1816+
}
18051817

18061818
// Let the operator fix the issue.
18071819
Expect(fdbCluster.SetSkipReconciliation(false)).NotTo(HaveOccurred())
18081820
})
18091821

1810-
It("should restart the cluster controller", func() {
1822+
It("should show the status without any messages", func() {
18111823
// The operator should be restarting the cluster controller and this should clean the unreachable_processes
18121824
Eventually(func() []string {
18131825
status := fdbCluster.GetStatus()
@@ -1854,19 +1866,21 @@ var _ = Describe("Operator", Label("e2e", "pr"), func() {
18541866
chaosmesh.From,
18551867
)
18561868

1857-
// Wait until the cluster shows the unreachable process.
1858-
Eventually(func() []string {
1859-
status := fdbCluster.GetStatus()
1869+
if !fdbAutomaticallyRemoveOldTester {
1870+
// Wait until the cluster shows the unreachable process.
1871+
Eventually(func() []string {
1872+
status := fdbCluster.GetStatus()
18601873

1861-
messages := make([]string, 0, len(status.Cluster.Messages))
1862-
for _, message := range status.Cluster.Messages {
1863-
messages = append(messages, message.Name)
1864-
}
1874+
messages := make([]string, 0, len(status.Cluster.Messages))
1875+
for _, message := range status.Cluster.Messages {
1876+
messages = append(messages, message.Name)
1877+
}
18651878

1866-
log.Println("current messages:", messages)
1879+
log.Println("current messages:", messages)
18671880

1868-
return messages
1869-
}).WithPolling(1 * time.Second).WithTimeout(2 * time.Minute).MustPassRepeatedly(5).Should(ContainElements("status_incomplete", "unreachable_processes"))
1881+
return messages
1882+
}).WithPolling(1 * time.Second).WithTimeout(2 * time.Minute).MustPassRepeatedly(5).Should(ContainElements("status_incomplete", "unreachable_processes"))
1883+
}
18701884

18711885
// Let the operator fix the issue.
18721886
Expect(fdbCluster.SetSkipReconciliation(false)).NotTo(HaveOccurred())
@@ -1876,8 +1890,7 @@ var _ = Describe("Operator", Label("e2e", "pr"), func() {
18761890
factory.DeleteChaosMeshExperimentSafe(exp)
18771891
})
18781892

1879-
// TODO what will be the status here?
1880-
It("should restart the cluster controller", func() {
1893+
It("should show the status without any messages", func() {
18811894
// The operator should be restarting the cluster controller and this should clean the unreachable_processes
18821895
Eventually(func() []string {
18831896
status := fdbCluster.GetStatus()

0 commit comments

Comments
 (0)