Skip to content

Commit b608b2b

Browse files
authored
Add a test case to run the exclusion in a multi-region cluster for a bigger cluster with injected latencies (#2183)
1 parent ff924de commit b608b2b

File tree

1 file changed

+108
-0
lines changed

1 file changed

+108
-0
lines changed

e2e/test_operator_ha/operator_ha_test.go

Lines changed: 108 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -313,5 +313,113 @@ var _ = Describe("Operator HA tests", Label("e2e", "pr"), func() {
313313
factory.DeleteDataLoader(fdbCluster.GetPrimary())
314314
})
315315
})
316+
317+
// Pending container (PWhen): injects network latency between the pods of the remote
// cluster and every FoundationDB pod in all four namespaces, then replaces one log pod
// of the remote data center and waits for the exclusion list to drain.
//
// Ginkgo's When/PWhen prepend "when " to the description themselves, so the text below
// intentionally does not start with "when".
PWhen("a remote side has network latency issues and a pod gets replaced", func() {
	/*
		TODO (johscheuer): This test should be running with a bigger multi-region cluster e.g.:

		config := fixtures.DefaultClusterConfigWithHaMode(fixtures.HaFourZoneSingleSat, false)
		config.StorageServerPerPod = 8
		config.MachineCount = 10
		config.DisksPerMachine = 8
	*/

	// experiment is created in BeforeEach and removed again in AfterEach.
	var experiment *fixtures.ChaosMeshExperiment

	BeforeEach(func() {
		dcID := fdbCluster.GetRemote().GetCluster().Spec.DataCenter

		status := fdbCluster.GetPrimary().GetStatus()

		// Pick the first process that is located in the remote data center and
		// currently reports the "log" role.
		var processGroupID fdbv1beta2.ProcessGroupID
		for _, process := range status.Cluster.Processes {
			dc, ok := process.Locality[fdbv1beta2.FDBLocalityDCIDKey]
			if !ok || dc != dcID {
				continue
			}

			var isLog bool
			for _, role := range process.Roles {
				if role.Role == "log" {
					isLog = true
					break
				}
			}

			if !isLog {
				continue
			}

			processGroupID = fdbv1beta2.ProcessGroupID(process.Locality[fdbv1beta2.FDBLocalityInstanceIDKey])
			break
		}

		// Fail the setup early instead of continuing with an empty process group ID,
		// which would end up replacing a zero-value Pod below.
		Expect(processGroupID).NotTo(BeEmpty(), "no process with the log role found in data center %q", dcID)

		log.Println("Will inject chaos into", processGroupID, "and replace it")
		// Resolve the selected process group ID to its Pod in the remote cluster.
		var replacedPod corev1.Pod
		for _, pod := range fdbCluster.GetRemote().GetLogPods().Items {
			if fixtures.GetProcessGroupID(pod) != processGroupID {
				continue
			}

			replacedPod = pod
			break
		}

		// The loop above leaves replacedPod as the zero value when no Pod matches.
		Expect(replacedPod.Name).NotTo(BeEmpty(), "no log Pod found for process group %q", processGroupID)

		log.Println("Inject latency chaos")
		experiment = factory.InjectNetworkLatency(
			// Source: Pods of the remote cluster.
			chaosmesh.PodSelectorSpec{
				GenericSelectorSpec: chaosmesh.GenericSelectorSpec{
					Namespaces:     []string{fdbCluster.GetRemote().Namespace()},
					LabelSelectors: fdbCluster.GetRemote().GetCachedCluster().GetMatchLabels(),
				},
			},
			// Target: every Pod carrying the FDB cluster label in any of the four namespaces.
			chaosmesh.PodSelectorSpec{
				GenericSelectorSpec: chaosmesh.GenericSelectorSpec{
					Namespaces: []string{
						fdbCluster.GetPrimary().Namespace(),
						fdbCluster.GetPrimarySatellite().Namespace(),
						fdbCluster.GetRemote().Namespace(),
						fdbCluster.GetRemoteSatellite().Namespace(),
					},
					ExpressionSelectors: []metav1.LabelSelectorRequirement{
						{
							Key:      fdbv1beta2.FDBClusterLabel,
							Operator: metav1.LabelSelectorOpExists,
						},
					},
				},
			}, chaosmesh.Both,
			&chaosmesh.DelaySpec{
				Latency:     "250ms",
				Correlation: "100",
				Jitter:      "0",
			})

		// TODO (johscheuer): Allow to have this as a long running task until the test is done.
		factory.CreateDataLoaderIfAbsentWithWait(fdbCluster.GetPrimary(), false)

		// Wait a minute before triggering the replacement.
		time.Sleep(1 * time.Minute)
		log.Println("replacedPod", replacedPod.Name, "useLocalitiesForExclusion", fdbCluster.GetPrimary().GetCluster().UseLocalitiesForExclusion())
		fdbCluster.GetRemote().ReplacePod(replacedPod, true)
	})

	It("should exclude and remove the pod", func() {
		// Poll the primary's status until the database configuration reports no
		// excluded servers anymore.
		// NOTE(review): this also succeeds if the list is still empty on the first
		// poll, i.e. before the exclusion was issued — confirm that ReplacePod with
		// `true` already waits long enough for that not to happen.
		Eventually(func() []fdbv1beta2.ExcludedServers {
			status := fdbCluster.GetPrimary().GetStatus()
			excludedServers := status.Cluster.DatabaseConfiguration.ExcludedServers
			log.Println("excludedServers", excludedServers)
			return excludedServers
		}).WithTimeout(15 * time.Minute).WithPolling(1 * time.Second).Should(BeEmpty())
	})

	AfterEach(func() {
		Expect(fdbCluster.GetRemote().ClearProcessGroupsToRemove()).NotTo(HaveOccurred())
		factory.DeleteChaosMeshExperimentSafe(experiment)
		// Making sure we included back all the process groups after exclusion is complete.
		Expect(fdbCluster.GetPrimary().GetStatus().Cluster.DatabaseConfiguration.ExcludedServers).To(BeEmpty())
		factory.DeleteDataLoader(fdbCluster.GetPrimary())
	})
})
316424
})
317425
})

0 commit comments

Comments
 (0)