Commit 5b2a6e9

Make sure to skip process groups that are under maintenance (#2004)
* Make sure to skip process groups that are under maintenance
* Fix e2e test case with new assumptions
1 parent c958293 commit 5b2a6e9

File tree

4 files changed: +99 -30 lines changed

controllers/update_status.go

Lines changed: 12 additions & 4 deletions
@@ -111,6 +111,7 @@ func (updateStatus) reconcile(ctx context.Context, r *FoundationDBClusterReconci
         clusterStatus.RequiredAddresses.NonTLS = true
     }

+    var currentMaintenanceZone fdbv1beta2.FaultDomain
     if databaseStatus != nil {
         for _, coordinator := range databaseStatus.Client.Coordinators.Coordinators {
             address, err := fdbv1beta2.ParseProcessAddress(coordinator.Address.String())
@@ -129,6 +130,7 @@ func (updateStatus) reconcile(ctx context.Context, r *FoundationDBClusterReconci
         clusterStatus.Health.Healthy = databaseStatus.Client.DatabaseStatus.Healthy
         clusterStatus.Health.FullReplication = databaseStatus.Cluster.FullReplication
         clusterStatus.Health.DataMovementPriority = databaseStatus.Cluster.Data.MovingData.HighestPriority
+        currentMaintenanceZone = databaseStatus.Cluster.MaintenanceZone
     }

     cluster.Status.RequiredAddresses = clusterStatus.RequiredAddresses
@@ -144,7 +146,7 @@ func (updateStatus) reconcile(ctx context.Context, r *FoundationDBClusterReconci
         return &requeue{curError: fmt.Errorf("update_status skipped due to error in refreshProcessGroupStatus: %w", err)}
     }

-    err = validateProcessGroups(ctx, r, cluster, &clusterStatus, processMap, configMap, pvcs, logger)
+    err = validateProcessGroups(ctx, r, cluster, &clusterStatus, processMap, configMap, pvcs, logger, currentMaintenanceZone)
     if err != nil {
         return &requeue{curError: fmt.Errorf("update_status skipped due to error in validateProcessGroups: %w", err)}
     }
@@ -432,7 +434,7 @@ func checkAndSetProcessStatus(logger logr.Logger, r *FoundationDBClusterReconcil
 }

 // Validate and set progressGroup's status
-func validateProcessGroups(ctx context.Context, r *FoundationDBClusterReconciler, cluster *fdbv1beta2.FoundationDBCluster, status *fdbv1beta2.FoundationDBClusterStatus, processMap map[fdbv1beta2.ProcessGroupID][]fdbv1beta2.FoundationDBStatusProcessInfo, configMap *corev1.ConfigMap, pvcs *corev1.PersistentVolumeClaimList, logger logr.Logger) error {
+func validateProcessGroups(ctx context.Context, r *FoundationDBClusterReconciler, cluster *fdbv1beta2.FoundationDBCluster, status *fdbv1beta2.FoundationDBClusterStatus, processMap map[fdbv1beta2.ProcessGroupID][]fdbv1beta2.FoundationDBStatusProcessInfo, configMap *corev1.ConfigMap, pvcs *corev1.PersistentVolumeClaimList, logger logr.Logger, maintenanceZone fdbv1beta2.FaultDomain) error {
     processGroupsWithoutExclusion := make(map[fdbv1beta2.ProcessGroupID]fdbv1beta2.None, len(cluster.Spec.ProcessGroupsToRemoveWithoutExclusion))
     for _, processGroupID := range cluster.Spec.ProcessGroupsToRemoveWithoutExclusion {
         processGroupsWithoutExclusion[processGroupID] = fdbv1beta2.None{}
@@ -460,6 +462,13 @@ func validateProcessGroups(ctx context.Context, r *FoundationDBClusterReconciler
             }
         }

+        // If a process group is under maintenance we want to skip performing checks on the process group to prevent
+        // misleading conditions that could lead to a replacement race condition.
+        if processGroup.IsUnderMaintenance(maintenanceZone) {
+            logger.Info("skip updating the process group as the process group is currently under maintenance", "faultDomain", processGroup.FaultDomain, "maintenanceZone", maintenanceZone)
+            continue
+        }
+
         pod, podError := r.PodLifecycleManager.GetPod(ctx, r, cluster, processGroup.GetPodName(cluster))
         if podError != nil {
             // If the process group is not being removed and the Pod is not set we need to put it into
@@ -564,7 +573,7 @@ func validateProcessGroups(ctx context.Context, r *FoundationDBClusterReconciler
     return nil
 }

-// validateProcessGroup runs specific checks for the status of an process group.
+// validateProcessGroup runs specific checks for the status of a process group.
 // returns failing, incorrect, error
 func validateProcessGroup(ctx context.Context, r *FoundationDBClusterReconciler, cluster *fdbv1beta2.FoundationDBCluster,
     pod *corev1.Pod, currentPVC *corev1.PersistentVolumeClaim, configMapHash string, processGroupStatus *fdbv1beta2.ProcessGroupStatus,
@@ -656,7 +665,6 @@ func validateProcessGroup(ctx context.Context, r *FoundationDBClusterReconciler,

     processGroupStatus.UpdateCondition(fdbv1beta2.PodFailing, failing)
     processGroupStatus.UpdateCondition(fdbv1beta2.PodPending, false)
-
     if !disableTaintFeature {
         err = updateTaintCondition(ctx, r, cluster, pod, processGroupStatus, logger.WithValues("Pod", pod.Name, "nodeName", pod.Spec.NodeName, "processGroupID", processGroupStatus.ProcessGroupID))
         if err != nil {
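The diff calls processGroup.IsUnderMaintenance(maintenanceZone) without showing its body. A minimal, self-contained sketch of the assumed behaviour follows; the real method lives on fdbv1beta2.ProcessGroupStatus and may differ in detail, and the type names here are trimmed stand-ins rather than the operator's API types.

package main

import "fmt"

// FaultDomain is a trimmed stand-in for fdbv1beta2.FaultDomain (a string-based type).
type FaultDomain string

// ProcessGroupStatus is a trimmed stand-in for fdbv1beta2.ProcessGroupStatus.
type ProcessGroupStatus struct {
    FaultDomain FaultDomain
}

// IsUnderMaintenance sketches the assumed check: a process group counts as under
// maintenance when a maintenance zone is set and matches the group's fault domain.
func (processGroup *ProcessGroupStatus) IsUnderMaintenance(zone FaultDomain) bool {
    if zone == "" {
        return false
    }

    return processGroup.FaultDomain == zone
}

func main() {
    processGroup := &ProcessGroupStatus{FaultDomain: "zone-1"}
    fmt.Println(processGroup.IsUnderMaintenance("zone-1")) // true: status checks are skipped
    fmt.Println(processGroup.IsUnderMaintenance(""))       // false: no maintenance zone set
}

With this reading, passing an empty string as the maintenance zone (as most of the unit tests below do) means no zone is under maintenance, so every process group is validated as before.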

controllers/update_status_test.go

Lines changed: 31 additions & 18 deletions
@@ -329,7 +329,7 @@ var _ = Describe("update_status", func() {

     When("a process group is fine", func() {
         It("should not get any condition assigned", func() {
-            err := validateProcessGroups(context.TODO(), clusterReconciler, cluster, &cluster.Status, processMap, configMap, allPvcs, logger)
+            err := validateProcessGroups(context.TODO(), clusterReconciler, cluster, &cluster.Status, processMap, configMap, allPvcs, logger, "")
             Expect(err).NotTo(HaveOccurred())
             Expect(len(cluster.Status.ProcessGroups)).To(BeNumerically(">", 4))
             processGroup := cluster.Status.ProcessGroups[len(cluster.Status.ProcessGroups)-4]
@@ -346,7 +346,7 @@ var _ = Describe("update_status", func() {
         It("should get a condition assigned", func() {
             dummyPod := &corev1.Pod{}
             Expect(k8sClient.Get(context.TODO(), ctrlClient.ObjectKeyFromObject(storagePod), dummyPod)).To(HaveOccurred())
-            err := validateProcessGroups(context.TODO(), clusterReconciler, cluster, &cluster.Status, processMap, configMap, allPvcs, logger)
+            err := validateProcessGroups(context.TODO(), clusterReconciler, cluster, &cluster.Status, processMap, configMap, allPvcs, logger, "")
             Expect(err).NotTo(HaveOccurred())

             missingProcesses := fdbv1beta2.FilterByCondition(cluster.Status.ProcessGroups, fdbv1beta2.MissingPod, false)
@@ -365,7 +365,7 @@ var _ = Describe("update_status", func() {
         })

         It("should get the ProcessIsMarkedAsExcluded condition", func() {
-            err := validateProcessGroups(context.TODO(), clusterReconciler, cluster, &cluster.Status, processMap, configMap, allPvcs, logger)
+            err := validateProcessGroups(context.TODO(), clusterReconciler, cluster, &cluster.Status, processMap, configMap, allPvcs, logger, "")
             Expect(err).NotTo(HaveOccurred())
             Expect(len(cluster.Status.ProcessGroups)).To(BeNumerically(">", 4))
             processGroup := cluster.Status.ProcessGroups[len(cluster.Status.ProcessGroups)-4]
@@ -381,7 +381,7 @@ var _ = Describe("update_status", func() {
         })

         It("should get a condition assigned", func() {
-            err := validateProcessGroups(context.TODO(), clusterReconciler, cluster, &cluster.Status, processMap, configMap, allPvcs, logger)
+            err := validateProcessGroups(context.TODO(), clusterReconciler, cluster, &cluster.Status, processMap, configMap, allPvcs, logger, "")
             Expect(err).NotTo(HaveOccurred())

             incorrectProcesses := fdbv1beta2.FilterByCondition(cluster.Status.ProcessGroups, fdbv1beta2.IncorrectCommandLine, false)
@@ -399,7 +399,7 @@ var _ = Describe("update_status", func() {
         })

         It("should get a condition assigned", func() {
-            err := validateProcessGroups(context.TODO(), clusterReconciler, cluster, &cluster.Status, processMap, configMap, allPvcs, logger)
+            err := validateProcessGroups(context.TODO(), clusterReconciler, cluster, &cluster.Status, processMap, configMap, allPvcs, logger, "")
             Expect(err).NotTo(HaveOccurred())

             incorrectProcesses := fdbv1beta2.FilterByCondition(cluster.Status.ProcessGroups, fdbv1beta2.IncorrectCommandLine, false)
@@ -420,7 +420,7 @@ var _ = Describe("update_status", func() {
         })

         It("should get a condition assigned", func() {
-            err := validateProcessGroups(context.TODO(), clusterReconciler, cluster, &cluster.Status, processMap, configMap, allPvcs, logger)
+            err := validateProcessGroups(context.TODO(), clusterReconciler, cluster, &cluster.Status, processMap, configMap, allPvcs, logger, "")
             Expect(err).NotTo(HaveOccurred())

             missingProcesses := fdbv1beta2.FilterByCondition(cluster.Status.ProcessGroups, fdbv1beta2.MissingProcesses, false)
@@ -434,7 +434,7 @@ var _ = Describe("update_status", func() {

     When("no processes are provided in the process map", func() {
         It("should not get a condition assigned", func() {
-            err := validateProcessGroups(context.TODO(), clusterReconciler, cluster, &cluster.Status, map[fdbv1beta2.ProcessGroupID][]fdbv1beta2.FoundationDBStatusProcessInfo{}, configMap, allPvcs, logger)
+            err := validateProcessGroups(context.TODO(), clusterReconciler, cluster, &cluster.Status, map[fdbv1beta2.ProcessGroupID][]fdbv1beta2.FoundationDBStatusProcessInfo{}, configMap, allPvcs, logger, "")
             Expect(err).NotTo(HaveOccurred())

             missingProcesses := fdbv1beta2.FilterByCondition(cluster.Status.ProcessGroups, fdbv1beta2.MissingProcesses, false)
@@ -456,7 +456,7 @@ var _ = Describe("update_status", func() {
         })

         It("should get a condition assigned", func() {
-            err := validateProcessGroups(context.TODO(), clusterReconciler, cluster, &cluster.Status, processMap, configMap, allPvcs, logger)
+            err := validateProcessGroups(context.TODO(), clusterReconciler, cluster, &cluster.Status, processMap, configMap, allPvcs, logger, "")
             Expect(err).NotTo(HaveOccurred())

             incorrectPods := fdbv1beta2.FilterByCondition(cluster.Status.ProcessGroups, fdbv1beta2.IncorrectPodSpec, false)
@@ -480,7 +480,7 @@ var _ = Describe("update_status", func() {
         })

         It("should get a condition assigned", func() {
-            err := validateProcessGroups(context.TODO(), clusterReconciler, cluster, &cluster.Status, processMap, configMap, allPvcs, logger)
+            err := validateProcessGroups(context.TODO(), clusterReconciler, cluster, &cluster.Status, processMap, configMap, allPvcs, logger, "")
             Expect(err).NotTo(HaveOccurred())

             missingProcesses := fdbv1beta2.FilterByCondition(cluster.Status.ProcessGroups, fdbv1beta2.PodFailing, false)
@@ -501,7 +501,7 @@ var _ = Describe("update_status", func() {
         })

         It("should get a condition assigned", func() {
-            err := validateProcessGroups(context.TODO(), clusterReconciler, cluster, &cluster.Status, processMap, configMap, allPvcs, logger)
+            err := validateProcessGroups(context.TODO(), clusterReconciler, cluster, &cluster.Status, processMap, configMap, allPvcs, logger, "")
             Expect(err).NotTo(HaveOccurred())

             failingPods := fdbv1beta2.FilterByCondition(cluster.Status.ProcessGroups, fdbv1beta2.PodFailing, false)
@@ -517,12 +517,11 @@ var _ = Describe("update_status", func() {
     When("the pod is failed", func() {
         BeforeEach(func() {
             storagePod.Status.Phase = corev1.PodFailed
-            err = k8sClient.Update(context.TODO(), storagePod)
-            Expect(err).NotTo(HaveOccurred())
+            Expect(k8sClient.Update(context.TODO(), storagePod)).NotTo(HaveOccurred())
         })

         It("should get a condition assigned", func() {
-            err := validateProcessGroups(context.TODO(), clusterReconciler, cluster, &cluster.Status, processMap, configMap, allPvcs, logger)
+            err := validateProcessGroups(context.TODO(), clusterReconciler, cluster, &cluster.Status, processMap, configMap, allPvcs, logger, "")
             Expect(err).NotTo(HaveOccurred())

             failingPods := fdbv1beta2.FilterByCondition(cluster.Status.ProcessGroups, fdbv1beta2.PodFailing, false)
@@ -533,6 +532,20 @@ var _ = Describe("update_status", func() {
             Expect(processGroup.ProcessGroupID).To(Equal(storageOneProcessGroupID))
             Expect(len(processGroup.ProcessGroupConditions)).To(Equal(1))
         })
+
+        When("the process group is under maintenance", func() {
+            It("should not set the conditions", func() {
+                processGroup := cluster.Status.ProcessGroups[len(cluster.Status.ProcessGroups)-4]
+                Expect(validateProcessGroups(context.TODO(), clusterReconciler, cluster, &cluster.Status, processMap, configMap, allPvcs, logger, processGroup.FaultDomain)).NotTo(HaveOccurred())
+
+                failingPods := fdbv1beta2.FilterByCondition(cluster.Status.ProcessGroups, fdbv1beta2.PodFailing, false)
+                Expect(failingPods).To(BeEmpty())
+
+                Expect(len(cluster.Status.ProcessGroups)).To(BeNumerically(">", 4))
+                Expect(processGroup.ProcessGroupID).To(Equal(storageOneProcessGroupID))
+                Expect(len(processGroup.ProcessGroupConditions)).To(BeZero())
+            })
+        })
     })

     When("adding a process group to the ProcessGroupsToRemove list", func() {
@@ -546,7 +559,7 @@ var _ = Describe("update_status", func() {
         })

         It("should mark the process group for removal", func() {
-            err := validateProcessGroups(context.TODO(), clusterReconciler, cluster, &cluster.Status, processMap, configMap, allPvcs, logger)
+            err := validateProcessGroups(context.TODO(), clusterReconciler, cluster, &cluster.Status, processMap, configMap, allPvcs, logger, "")
             Expect(err).NotTo(HaveOccurred())

             removalCount := 0
@@ -577,7 +590,7 @@ var _ = Describe("update_status", func() {
         })

         It("should be mark the process group for removal without exclusion", func() {
-            err := validateProcessGroups(context.TODO(), clusterReconciler, cluster, &cluster.Status, processMap, configMap, allPvcs, logger)
+            err := validateProcessGroups(context.TODO(), clusterReconciler, cluster, &cluster.Status, processMap, configMap, allPvcs, logger, "")
             Expect(err).NotTo(HaveOccurred())

             removalCount := 0
@@ -608,7 +621,7 @@ var _ = Describe("update_status", func() {
         })

         It("should mark the process group as unreachable", func() {
-            err := validateProcessGroups(context.TODO(), clusterReconciler, cluster, &cluster.Status, processMap, configMap, allPvcs, logger)
+            err := validateProcessGroups(context.TODO(), clusterReconciler, cluster, &cluster.Status, processMap, configMap, allPvcs, logger, "")
             Expect(err).NotTo(HaveOccurred())

             unreachableCount := 0
@@ -632,7 +645,7 @@ var _ = Describe("update_status", func() {
         })

         It("should remove the condition", func() {
-            err := validateProcessGroups(context.TODO(), clusterReconciler, cluster, &cluster.Status, processMap, configMap, allPvcs, logger)
+            err := validateProcessGroups(context.TODO(), clusterReconciler, cluster, &cluster.Status, processMap, configMap, allPvcs, logger, "")
             Expect(err).NotTo(HaveOccurred())

             unreachableCount := 0
@@ -660,7 +673,7 @@ var _ = Describe("update_status", func() {
         })

         It("should mark the process group as Pod pending", func() {
-            err := validateProcessGroups(context.TODO(), clusterReconciler, cluster, &cluster.Status, processMap, configMap, allPvcs, logger)
+            err := validateProcessGroups(context.TODO(), clusterReconciler, cluster, &cluster.Status, processMap, configMap, allPvcs, logger, "")
             Expect(err).NotTo(HaveOccurred())

             pendingCount := 0

e2e/test_operator/operator_test.go

Lines changed: 5 additions & 8 deletions
@@ -1564,9 +1564,6 @@ var _ = Describe("Operator", Label("e2e", "pr"), func() {

             _, _ = fdbCluster.RunFdbCliCommandInOperator(fmt.Sprintf("maintenance on %s 3600", targetProcessGroup.FaultDomain), false, 30)

-            // Make sure we trigger a reconciliation to speed up the test case and allow the operator to detect the maintenance mode is set.
-            fdbCluster.ForceReconcile()
-
             // Partition the Pod
             pod := fdbCluster.GetPod(targetProcessGroup.GetPodName(fdbCluster.GetCachedCluster()))
             exp = factory.InjectPartitionBetween(
@@ -1581,21 +1578,21 @@ var _ = Describe("Operator", Label("e2e", "pr"), func() {

             // Make sure the operator notices that the process is not reporting
             fdbCluster.ForceReconcile()
+        })

-            Eventually(func() *int64 {
+        It("should not replace the process group", func() {
+            Consistently(func() []*fdbv1beta2.ProcessGroupCondition {
                 for _, processGroup := range fdbCluster.GetCluster().Status.ProcessGroups {
                     if processGroup.ProcessGroupID != targetProcessGroup.ProcessGroupID {
                         continue
                     }

-                    return processGroup.GetConditionTime(fdbv1beta2.MissingProcesses)
+                    return processGroup.ProcessGroupConditions
                 }

                 return nil
-            }).WithTimeout(5 * time.Minute).WithPolling(1 * time.Second).ShouldNot(BeNil())
-        })
+            }).WithTimeout(2 * time.Minute).WithPolling(1 * time.Second).Should(BeEmpty())

-        It("should not replace the process group", func() {
             Consistently(func() bool {
                 for _, processGroup := range fdbCluster.GetCluster().Status.ProcessGroups {
                     if processGroup.ProcessGroupID != targetProcessGroup.ProcessGroupID {
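The assertion here flips from Eventually (wait until the MissingProcesses condition appears) to Consistently (the condition list must stay empty for the whole window), because with the skip in validateProcessGroups the operator is expected to never set conditions for a process group in the maintenance zone. A small, self-contained Gomega sketch of that pattern; the conditions func is a stand-in for reading processGroup.ProcessGroupConditions from the cluster status, and the short durations are only for illustration.

package example_test

import (
    "testing"
    "time"

    . "github.com/onsi/gomega"
)

func TestConditionsStayEmpty(t *testing.T) {
    g := NewWithT(t)

    // Stand-in for fetching processGroup.ProcessGroupConditions from the cluster status.
    conditions := func() []string {
        return nil
    }

    // Consistently polls for the full timeout and fails if the matcher ever stops passing,
    // whereas Eventually would pass as soon as the matcher succeeds once.
    g.Consistently(conditions).
        WithTimeout(2 * time.Second).
        WithPolling(100 * time.Millisecond).
        Should(BeEmpty())
}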

e2e/test_operator_upgrades/operator_upgrades_test.go

Lines changed: 51 additions & 0 deletions
@@ -28,6 +28,7 @@ Since FoundationDB is version incompatible for major and minor versions and the
 */

 import (
+    "fmt"
     "log"
     "strings"
     "time"
@@ -603,4 +604,54 @@ var _ = Describe("Operator Upgrades", Label("e2e", "pr"), func() {
         EntryDescription("Upgrade from %[1]s to %[2]s with a pending pod"),
         fixtures.GenerateUpgradeTableEntries(testOptions),
     )
+
+    DescribeTable(
+        "one process is under maintenance",
+        func(beforeVersion string, targetVersion string) {
+            if fixtures.VersionsAreProtocolCompatible(beforeVersion, targetVersion) {
+                Skip("this test only affects version incompatible upgrades")
+            }
+
+            clusterSetup(beforeVersion, true)
+
+            // Pick a storage process and set it under maintenance
+            var storageProcessGroupUnderMaintenance *fdbv1beta2.ProcessGroupStatus
+            for _, processGroup := range fdbCluster.GetCluster().Status.ProcessGroups {
+                if processGroup.ProcessClass != fdbv1beta2.ProcessClassStorage {
+                    continue
+                }
+
+                storageProcessGroupUnderMaintenance = processGroup
+                break
+            }
+
+            Expect(storageProcessGroupUnderMaintenance).NotTo(BeNil())
+            Expect(storageProcessGroupUnderMaintenance.FaultDomain).NotTo(BeEmpty())
+            log.Println("picked process group", storageProcessGroupUnderMaintenance.ProcessGroupID, "to be under maintenance with fault domain:", storageProcessGroupUnderMaintenance.FaultDomain)
+            _, _ = fdbCluster.RunFdbCliCommandInOperator(fmt.Sprintf("maintenance on %s 3600", storageProcessGroupUnderMaintenance.FaultDomain), false, 30)
+
+            // Make sure the machine-readable status reflects the maintenance mode
+            Eventually(func() fdbv1beta2.FaultDomain {
+                return fdbCluster.GetStatus().Cluster.MaintenanceZone
+            }).WithTimeout(2 * time.Minute).WithPolling(2 * time.Second).MustPassRepeatedly(5).Should(Equal(storageProcessGroupUnderMaintenance.FaultDomain))

+            // Update the cluster version.
+            Expect(fdbCluster.UpgradeCluster(targetVersion, false)).NotTo(HaveOccurred())
+
+            // Make sure the cluster is not upgraded until the maintenance is removed.
+            Consistently(func() string {
+                return fdbCluster.GetCluster().GetRunningVersion()
+            }).WithTimeout(5 * time.Minute).WithPolling(5 * time.Second).Should(Equal(beforeVersion))
+            // Turn maintenance off.
+            _, _ = fdbCluster.RunFdbCliCommandInOperator("maintenance off", false, 30)
+
+            // Make sure the cluster is upgraded
+            fdbCluster.VerifyVersion(targetVersion)
+
+            // Make sure the cluster has no data loss.
+            fdbCluster.EnsureTeamTrackersHaveMinReplicas()
+        },
+        EntryDescription("Upgrade from %[1]s to %[2]s"),
+        fixtures.GenerateUpgradeTableEntries(testOptions),
+    )
 })
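The Eventually block in this test reads fdbCluster.GetStatus().Cluster.MaintenanceZone after running `maintenance on <fault domain> 3600` through fdbcli. A small sketch of how such a field would be decoded from the machine-readable status, assuming the JSON key is cluster.maintenance_zone; the structs are trimmed stand-ins for the operator's FoundationDBStatus types, not the actual definitions.

package main

import (
    "encoding/json"
    "fmt"
)

// Trimmed stand-ins for the operator's machine-readable status structs.
// The "maintenance_zone" JSON key is an assumption about the status layout
// behind databaseStatus.Cluster.MaintenanceZone in the diffs above.
type clusterStatus struct {
    MaintenanceZone string `json:"maintenance_zone,omitempty"`
}

type statusDocument struct {
    Cluster clusterStatus `json:"cluster"`
}

func main() {
    // Example payload as it might look after `maintenance on zone-1 3600`.
    raw := []byte(`{"cluster":{"maintenance_zone":"zone-1"}}`)

    var status statusDocument
    if err := json.Unmarshal(raw, &status); err != nil {
        panic(err)
    }

    // An empty value would mean no zone is under maintenance.
    fmt.Println("maintenance zone:", status.Cluster.MaintenanceZone)
}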

0 commit comments
