Skip to content

Commit dec876a

Browse files
authored
Allow the pod metadata updater to update the metdadata for Pods marked as removal (#2339)
* Allow the pod metadata updater to update the metdadata for Pods marked as removal
1 parent 46fb55e commit dec876a

File tree

5 files changed

+154
-53
lines changed

5 files changed

+154
-53
lines changed

controllers/update_metadata.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -44,8 +44,8 @@ func (updateMetadata) reconcile(
4444
) *requeue {
4545
var shouldRequeue bool
4646
for _, processGroup := range cluster.Status.ProcessGroups {
47-
if processGroup.IsMarkedForRemoval() {
48-
logger.V(1).Info("Ignore process group marked for removal",
47+
if processGroup.GetConditionTime(fdbv1beta2.ResourcesTerminating) != nil {
48+
logger.V(1).Info("Ignore process group that is stuck in terminating state",
4949
"processGroupID", processGroup.ProcessGroupID)
5050
continue
5151
}

controllers/update_pod_config.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ func (updatePodConfig) reconcile(
6363
for _, processGroup := range cluster.Status.ProcessGroups {
6464
curLogger := logger.WithValues("processGroupID", processGroup.ProcessGroupID)
6565
if processGroup.GetConditionTime(fdbv1beta2.ResourcesTerminating) != nil {
66-
curLogger.V(1).Info("Ignore process group marked that is stuck terminating")
66+
curLogger.V(1).Info("Ignore process group that is stuck terminating")
6767
continue
6868
}
6969

controllers/update_status.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -493,6 +493,9 @@ func checkAndSetProcessStatus(
493493

494494
processGroupStatus.UpdateCondition(fdbv1beta2.MissingProcesses, hasMissingProcesses)
495495
processGroupStatus.UpdateCondition(fdbv1beta2.SidecarUnreachable, sidecarUnreachable)
496+
// If the processes are not up and running, we will reset the incorrect command line, e.g. in case that the processes
497+
// were just restarted by the operator.
498+
processGroupStatus.UpdateCondition(fdbv1beta2.IncorrectCommandLine, hasIncorrectCommandLine)
496499
// If the processes are absent, we are not able to determine the state of the processes, therefore we won't change it.
497500
if hasMissingProcesses {
498501
return nil
@@ -520,7 +523,6 @@ func checkAndSetProcessStatus(
520523
if sidecarUnreachable {
521524
return nil
522525
}
523-
processGroupStatus.UpdateCondition(fdbv1beta2.IncorrectCommandLine, hasIncorrectCommandLine)
524526

525527
return nil
526528
}

e2e/test_operator/operator_test.go

Lines changed: 113 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ import (
3333
"bytes"
3434
"context"
3535
"encoding/binary"
36+
"encoding/json"
3637
"fmt"
3738
"log"
3839
"math"
@@ -2477,4 +2478,116 @@ var _ = Describe("Operator", Label("e2e", "pr"), func() {
24772478
})
24782479
})
24792480
})
2481+
2482+
When(
2483+
"a process group should be removed and deletions are disabled and a new knob is rolled out",
2484+
func() {
2485+
var processGroupID fdbv1beta2.ProcessGroupID
2486+
var initialCustomParameters fdbv1beta2.FoundationDBCustomParameters
2487+
2488+
BeforeEach(func() {
2489+
availabilityCheck = false
2490+
2491+
spec := fdbCluster.GetCluster().Spec.DeepCopy()
2492+
spec.AutomationOptions.DeletionMode = fdbv1beta2.PodUpdateModeNone
2493+
fdbCluster.UpdateClusterSpecWithSpec(spec)
2494+
2495+
storagePod := factory.RandomPickOnePod(fdbCluster.GetStoragePods().Items)
2496+
log.Println("Selected Pod:", storagePod.Name, " to be replaced")
2497+
fdbCluster.ReplacePod(storagePod, false)
2498+
processGroupID = fixtures.GetProcessGroupID(storagePod)
2499+
2500+
// Wait until the process group is marked for removal.
2501+
Eventually(func(g Gomega) {
2502+
processGroup := fdbv1beta2.FindProcessGroupByID(
2503+
fdbCluster.GetCluster().Status.ProcessGroups,
2504+
processGroupID,
2505+
)
2506+
2507+
out, err := json.Marshal(processGroup)
2508+
g.Expect(err).NotTo(HaveOccurred())
2509+
log.Println("waiting for marked to be removed", string(out))
2510+
g.Expect(processGroup).NotTo(BeNil())
2511+
g.Expect(processGroup.RemovalTimestamp).NotTo(BeNil())
2512+
}).WithTimeout(10 * time.Minute).WithPolling(1 * time.Second).To(Succeed())
2513+
2514+
initialCustomParameters = fdbCluster.GetCustomParameters(
2515+
fdbv1beta2.ProcessClassStorage,
2516+
)
2517+
newCustomParameters := append(
2518+
initialCustomParameters,
2519+
"knob_max_trace_lines=1000000",
2520+
)
2521+
2522+
// Disable the operator bounce feature to ensure the targeted process group sees the changes.
2523+
fdbCluster.SetKillProcesses(false, false)
2524+
2525+
Expect(
2526+
fdbCluster.SetCustomParameters(
2527+
map[fdbv1beta2.ProcessClass]fdbv1beta2.FoundationDBCustomParameters{
2528+
fdbv1beta2.ProcessClassStorage: newCustomParameters,
2529+
},
2530+
false,
2531+
),
2532+
).NotTo(HaveOccurred())
2533+
2534+
// Wait until the process group has the incorrect command line
2535+
Eventually(func(g Gomega) {
2536+
processGroup := fdbv1beta2.FindProcessGroupByID(
2537+
fdbCluster.GetCluster().Status.ProcessGroups,
2538+
processGroupID,
2539+
)
2540+
2541+
out, err := json.Marshal(processGroup)
2542+
g.Expect(err).NotTo(HaveOccurred())
2543+
log.Println("Waiting for incorrect commandline", string(out))
2544+
g.Expect(processGroup).NotTo(BeNil())
2545+
g.Expect(processGroup.GetConditionTime(fdbv1beta2.IncorrectCommandLine)).
2546+
NotTo(BeNil())
2547+
}).WithTimeout(10 * time.Minute).WithPolling(1 * time.Second).To(Succeed())
2548+
})
2549+
2550+
AfterEach(func() {
2551+
spec := fdbCluster.GetCluster().Spec.DeepCopy()
2552+
spec.AutomationOptions.DeletionMode = fdbv1beta2.PodUpdateModeZone
2553+
fdbCluster.UpdateClusterSpecWithSpec(spec)
2554+
2555+
Expect(fdbCluster.SetCustomParameters(
2556+
map[fdbv1beta2.ProcessClass]fdbv1beta2.FoundationDBCustomParameters{
2557+
fdbv1beta2.ProcessClassStorage: initialCustomParameters,
2558+
},
2559+
false,
2560+
)).NotTo(HaveOccurred())
2561+
})
2562+
2563+
It("should update the pod that is marked for removal", func() {
2564+
log.Println(
2565+
"Make sure process group",
2566+
processGroupID,
2567+
"will be updated",
2568+
)
2569+
2570+
// Enable the kill feature again.
2571+
fdbCluster.SetKillProcesses(true, false)
2572+
// Make sure the process group is updated after some time.
2573+
Eventually(func(g Gomega) {
2574+
processGroup := fdbv1beta2.FindProcessGroupByID(
2575+
fdbCluster.GetCluster().Status.ProcessGroups,
2576+
processGroupID,
2577+
)
2578+
2579+
out, err := json.Marshal(processGroup)
2580+
g.Expect(err).NotTo(HaveOccurred())
2581+
log.Println("Waiting for process group to be updated", string(out))
2582+
// Process group should still exist.
2583+
g.Expect(processGroup).NotTo(BeNil())
2584+
// Process group should be marked for removal.
2585+
g.Expect(processGroup.RemovalTimestamp).NotTo(BeNil())
2586+
// Process group should be updated with the new knobs.
2587+
g.Expect(processGroup.GetConditionTime(fdbv1beta2.IncorrectCommandLine)).
2588+
To(BeNil())
2589+
}).WithTimeout(10 * time.Minute).WithPolling(5 * time.Second).Should(Succeed())
2590+
})
2591+
},
2592+
)
24802593
})

internal/coordination/coordination.go

Lines changed: 35 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,9 @@ import (
2626
"strings"
2727
"time"
2828

29-
fdbv1beta2 "github.com/FoundationDB/fdb-kubernetes-operator/v2/api/v1beta2"
3029
"github.com/FoundationDB/fdb-kubernetes-operator/v2/internal/restarts"
30+
31+
fdbv1beta2 "github.com/FoundationDB/fdb-kubernetes-operator/v2/api/v1beta2"
3132
"github.com/FoundationDB/fdb-kubernetes-operator/v2/pkg/fdbadminclient"
3233
"github.com/go-logr/logr"
3334
)
@@ -400,6 +401,29 @@ func UpdateGlobalCoordinationState(
400401
continue
401402
}
402403

404+
// If the process group is missing long enough to be ignored, ensure that it's removed from the pending
405+
// and the ready list.
406+
if processGroup.GetConditionTime(fdbv1beta2.IncorrectCommandLine) != nil &&
407+
!restarts.ShouldBeIgnoredBecauseMissing(logger, cluster, processGroup) {
408+
// Check if the process group is present in pendingForRestart.
409+
// If not add it to the according set.
410+
if _, ok := pendingForRestart[processGroup.ProcessGroupID]; !ok {
411+
updatesPendingForRestart[processGroup.ProcessGroupID] = fdbv1beta2.UpdateActionAdd
412+
}
413+
} else {
414+
// Check if the process group is present in pendingForRestart or readyForRestart.
415+
// If so, add them to the set to remove those entries as the process has the correct command line.
416+
if _, ok := pendingForRestart[processGroup.ProcessGroupID]; ok {
417+
logger.V(1).Info("Removing from pendingForRestart", "processGroupID", processGroup.ProcessGroupID)
418+
updatesPendingForRestart[processGroup.ProcessGroupID] = fdbv1beta2.UpdateActionDelete
419+
}
420+
421+
if _, ok := readyForRestart[processGroup.ProcessGroupID]; ok {
422+
logger.V(1).Info("Removing from readyForRestart", "processGroupID", processGroup.ProcessGroupID)
423+
updatesReadyForRestart[processGroup.ProcessGroupID] = fdbv1beta2.UpdateActionDelete
424+
}
425+
}
426+
403427
// Keep track of the visited process group to remove entries from removed process groups.
404428
visited[processGroup.ProcessGroupID] = fdbv1beta2.None{}
405429
if processGroup.IsMarkedForRemoval() {
@@ -457,44 +481,29 @@ func UpdateGlobalCoordinationState(
457481
updatesReadyForInclusion[processGroup.ProcessGroupID] = fdbv1beta2.UpdateActionAdd
458482
}
459483
}
460-
461-
// if the process group is excluded, we don't need to restart it.
462-
if _, ok := pendingForRestart[processGroup.ProcessGroupID]; ok {
463-
logger.V(1).
464-
Info("Removing from pendingForRestart", "processGroupID", processGroup.ProcessGroupID, "reason", reason)
465-
updatesPendingForRestart[processGroup.ProcessGroupID] = fdbv1beta2.UpdateActionDelete
466-
}
467-
468-
if _, ok := readyForRestart[processGroup.ProcessGroupID]; ok {
469-
logger.V(1).
470-
Info("Removing from readyForRestart", "processGroupID", processGroup.ProcessGroupID, "reason", reason)
471-
updatesReadyForRestart[processGroup.ProcessGroupID] = fdbv1beta2.UpdateActionDelete
472-
}
473484
}
474485

475486
// If the process group is stuck in terminating, we can add it to the ready for inclusion list.
476487
if processGroup.GetConditionTime(fdbv1beta2.ResourcesTerminating) != nil {
477488
if _, ok := pendingForInclusion[processGroup.ProcessGroupID]; !ok {
478489
logger.V(1).
479-
Info("Adding to pendingForInclusion and readyForInclusion", "processGroupID", processGroup.ProcessGroupID, "reason", "process group is marked for removal and in terminating")
490+
Info("Adding to pendingForInclusion and readyForInclusion", "processGroupID", processGroup.ProcessGroupID, "reason", "process group is in terminating state")
480491
updatesPendingForInclusion[processGroup.ProcessGroupID] = fdbv1beta2.UpdateActionAdd
481492
updatesReadyForInclusion[processGroup.ProcessGroupID] = fdbv1beta2.UpdateActionAdd
482493
}
483494

484495
// If the process group is marked for removal and the resources are stuck in terminating or the processes are not running, we should
485496
// remove them from the restart list, because there are no processes to restart.
486-
if processGroup.GetConditionTime(fdbv1beta2.MissingProcesses) != nil {
487-
if _, ok := pendingForRestart[processGroup.ProcessGroupID]; ok {
488-
logger.V(1).
489-
Info("Removing from pendingForRestart", "processGroupID", processGroup.ProcessGroupID, "reason", "process group is marked for removal")
490-
updatesPendingForRestart[processGroup.ProcessGroupID] = fdbv1beta2.UpdateActionDelete
491-
}
497+
if _, ok := pendingForRestart[processGroup.ProcessGroupID]; ok {
498+
logger.V(1).
499+
Info("Removing from pendingForRestart", "processGroupID", processGroup.ProcessGroupID, "reason", "process group is in terminating state")
500+
updatesPendingForRestart[processGroup.ProcessGroupID] = fdbv1beta2.UpdateActionDelete
501+
}
492502

493-
if _, ok := readyForRestart[processGroup.ProcessGroupID]; ok {
494-
logger.V(1).
495-
Info("Removing from readyForRestart", "processGroupID", processGroup.ProcessGroupID, "reason", "process group is marked for removal")
496-
updatesReadyForRestart[processGroup.ProcessGroupID] = fdbv1beta2.UpdateActionDelete
497-
}
503+
if _, ok := readyForRestart[processGroup.ProcessGroupID]; ok {
504+
logger.V(1).
505+
Info("Removing from readyForRestart", "processGroupID", processGroup.ProcessGroupID, "reason", "process group is in terminating state")
506+
updatesReadyForRestart[processGroup.ProcessGroupID] = fdbv1beta2.UpdateActionDelete
498507
}
499508
}
500509

@@ -507,29 +516,6 @@ func UpdateGlobalCoordinationState(
507516

508517
continue
509518
}
510-
511-
// If the process group is missing long enough to be ignored, ensure that it's removed from the pending
512-
// and the ready list.
513-
if processGroup.GetConditionTime(fdbv1beta2.IncorrectCommandLine) != nil &&
514-
!restarts.ShouldBeIgnoredBecauseMissing(logger, cluster, processGroup) {
515-
// Check if the process group is present in pendingForRestart.
516-
// If not add it to the according set.
517-
if _, ok := pendingForRestart[processGroup.ProcessGroupID]; !ok {
518-
updatesPendingForRestart[processGroup.ProcessGroupID] = fdbv1beta2.UpdateActionAdd
519-
}
520-
} else {
521-
// Check if the process group is present in pendingForRestart or readyForRestart.
522-
// If so, add them to the set to remove those entries as the process has the correct command line.
523-
if _, ok := pendingForRestart[processGroup.ProcessGroupID]; ok {
524-
logger.V(1).Info("Removing from pendingForRestart", "processGroupID", processGroup.ProcessGroupID)
525-
updatesPendingForRestart[processGroup.ProcessGroupID] = fdbv1beta2.UpdateActionDelete
526-
}
527-
528-
if _, ok := readyForRestart[processGroup.ProcessGroupID]; ok {
529-
logger.V(1).Info("Removing from readyForRestart", "processGroupID", processGroup.ProcessGroupID)
530-
updatesReadyForRestart[processGroup.ProcessGroupID] = fdbv1beta2.UpdateActionDelete
531-
}
532-
}
533519
}
534520

535521
// Iterate over all the sets and mark all entries that are associated with a removed process group to be

0 commit comments

Comments
 (0)