Skip to content

Commit 5a2b162

Browse files
authored
Restart processes only once when doing a setting rollout and include uptime from processes (#2164)
* Restart processes only once when doing a setting rollout and include uptime from processes
1 parent ef43912 commit 5a2b162

File tree

8 files changed

+114
-13
lines changed

8 files changed

+114
-13
lines changed

controllers/bounce_processes.go

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -162,12 +162,18 @@ func (c bounceProcesses) reconcile(_ context.Context, r *FoundationDBClusterReco
162162

163163
logger.Info("Bouncing processes", "addresses", addresses, "upgrading", upgrading)
164164
r.Recorder.Event(cluster, corev1.EventTypeNormal, "BouncingProcesses", fmt.Sprintf("Bouncing processes: %v", addresses))
165-
err = adminClient.KillProcesses(addresses)
165+
if upgrading {
166+
// When upgrading, we want to issue two restart commands to increase the probability that we are restarting all
167+
// processes in the cluster.
168+
err = adminClient.KillProcessesForUpgrade(addresses)
169+
} else {
170+
err = adminClient.KillProcesses(addresses)
171+
}
166172
if err != nil {
167173
return &requeue{curError: err}
168174
}
169175

170-
// Reset the SecondsSinceLastRecovered sine the operator just restarted some processes, which will could cause a recovery.
176+
// Reset the SecondsSinceLastRecovered since the operator just restarted some processes, which could cause a recovery.
171177
status.Cluster.RecoveryState.SecondsSinceLastRecovered = 0.0
172178

173179
// If the cluster was upgraded we will requeue and let the update_status command set the correct version.

e2e/test_operator/operator_test.go

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2619,4 +2619,48 @@ var _ = Describe("Operator", Label("e2e", "pr"), func() {
26192619
Expect(factory.GetControllerRuntimeClient().Update(context.Background(), pod)).NotTo(HaveOccurred())
26202620
})
26212621
})
2622+
2623+
When("a new knob for storage servers is rolled out to the cluster", func() {
2624+
var initialCustomParameters fdbv1beta2.FoundationDBCustomParameters
2625+
2626+
BeforeEach(func() {
2627+
initialCustomParameters = fdbCluster.GetCustomParameters(
2628+
fdbv1beta2.ProcessClassStorage,
2629+
)
2630+
2631+
Expect(
2632+
fdbCluster.SetCustomParameters(
2633+
fdbv1beta2.ProcessClassStorage,
2634+
append(
2635+
initialCustomParameters,
2636+
"knob_read_sampling_enabled=true",
2637+
),
2638+
false,
2639+
),
2640+
).NotTo(HaveOccurred())
2641+
})
2642+
2643+
AfterEach(func() {
2644+
Expect(fdbCluster.SetCustomParameters(
2645+
fdbv1beta2.ProcessClassStorage,
2646+
initialCustomParameters,
2647+
true,
2648+
)).NotTo(HaveOccurred())
2649+
})
2650+
2651+
It("should update the locality with the substituted environment variable", func() {
2652+
Eventually(func(g Gomega) bool {
2653+
for _, process := range fdbCluster.GetStatus().Cluster.Processes {
2654+
// We change the knob only for storage processes.
2655+
if process.ProcessClass != fdbv1beta2.ProcessClassStorage {
2656+
continue
2657+
}
2658+
2659+
g.Expect(process.CommandLine).To(ContainSubstring("knob_read_sampling_enabled"))
2660+
}
2661+
2662+
return true
2663+
}).WithTimeout(5 * time.Minute).WithPolling(5 * time.Second).Should(BeTrue())
2664+
})
2665+
})
26222666
})

fdbclient/admin_client.go

Lines changed: 30 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -434,18 +434,43 @@ func (client *cliAdminClient) CanSafelyRemove(addresses []fdbv1beta2.ProcessAddr
434434
return fdbstatus.CanSafelyRemoveFromStatus(client.log, client, addresses, status)
435435
}
436436

437+
func getKillCommand(addresses []fdbv1beta2.ProcessAddress, isUpgrade bool) string {
438+
var killCmd strings.Builder
439+
440+
addrString := fdbv1beta2.ProcessAddressesStringWithoutFlags(addresses, " ")
441+
killCmd.WriteString("kill; kill ")
442+
killCmd.WriteString(addrString)
443+
444+
if isUpgrade {
445+
killCmd.WriteString("; sleep 1; kill ")
446+
killCmd.WriteString(addrString)
447+
}
448+
killCmd.WriteString("; sleep 5")
449+
450+
return killCmd.String()
451+
}
452+
437453
// KillProcesses restarts processes
438454
func (client *cliAdminClient) KillProcesses(addresses []fdbv1beta2.ProcessAddress) error {
439455
if len(addresses) == 0 {
440456
return nil
441457
}
442458

443-
killCommand := fmt.Sprintf(
444-
"kill; kill %[1]s; sleep 1; kill %[1]s; sleep 5",
445-
fdbv1beta2.ProcessAddressesStringWithoutFlags(addresses, " "),
446-
)
447459
// Run the kill command once with the max timeout to reduce the risk of multiple recoveries happening.
448-
_, err := client.runCommand(cliCommand{command: killCommand, timeout: client.getTimeout()})
460+
_, err := client.runCommand(cliCommand{command: getKillCommand(addresses, false), timeout: client.getTimeout()})
461+
462+
return err
463+
}
464+
465+
// KillProcessesForUpgrade restarts processes for upgrades, this will issue 2 kill commands to make sure all
466+
// processes are restarted.
467+
func (client *cliAdminClient) KillProcessesForUpgrade(addresses []fdbv1beta2.ProcessAddress) error {
468+
if len(addresses) == 0 {
469+
return nil
470+
}
471+
472+
// Run the kill command once with the max timeout to reduce the risk of multiple recoveries happening.
473+
_, err := client.runCommand(cliCommand{command: getKillCommand(addresses, true), timeout: client.getTimeout()})
449474

450475
return err
451476
}

fdbclient/admin_client_test.go

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1293,4 +1293,19 @@ protocol fdb00b071010000`,
12931293
})
12941294
})
12951295
})
1296+
1297+
When("getting the kill command", func() {
1298+
addresses := []fdbv1beta2.ProcessAddress{{
1299+
IPAddress: net.ParseIP("192.168.0.2"),
1300+
Port: 4500,
1301+
}}
1302+
1303+
When("the cluster is upgraded ", func() {
1304+
Expect(getKillCommand(addresses, true)).To(Equal("kill; kill 192.168.0.2:4500; sleep 1; kill 192.168.0.2:4500; sleep 5"))
1305+
})
1306+
1307+
When("the cluster is not upgraded", func() {
1308+
Expect(getKillCommand(addresses, false)).To(Equal("kill; kill 192.168.0.2:4500; sleep 5"))
1309+
})
1310+
})
12961311
})

pkg/fdbadminclient/admin_client.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,10 @@ type AdminClient interface {
5656
// KillProcesses restarts processes
5757
KillProcesses(addresses []fdbv1beta2.ProcessAddress) error
5858

59+
// KillProcessesForUpgrade restarts processes for upgrades, this will issue 2 kill commands to make sure all
60+
// processes are restarted.
61+
KillProcessesForUpgrade(addresses []fdbv1beta2.ProcessAddress) error
62+
5963
// ChangeCoordinators changes the coordinator set
6064
ChangeCoordinators(addresses []fdbv1beta2.ProcessAddress) (string, error)
6165

pkg/fdbadminclient/mock/admin_client_mock.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -702,6 +702,12 @@ func (client *AdminClient) KillProcesses(addresses []fdbv1beta2.ProcessAddress)
702702
return nil
703703
}
704704

705+
// KillProcessesForUpgrade restarts processes for upgrades, this will issue 2 kill commands to make sure all
706+
// processes are restarted.
707+
func (client *AdminClient) KillProcessesForUpgrade(addresses []fdbv1beta2.ProcessAddress) error {
708+
return client.KillProcesses(addresses)
709+
}
710+
705711
// ChangeCoordinators changes the coordinator set
706712
func (client *AdminClient) ChangeCoordinators(addresses []fdbv1beta2.ProcessAddress) (string, error) {
707713
adminClientMutex.Lock()

pkg/fdbstatus/status_checks.go

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -283,12 +283,10 @@ func GetMinimumUptimeAndAddressMap(logger logr.Logger, cluster *fdbv1beta2.Found
283283
return 0, nil, err
284284
}
285285

286-
useRecoveryState := runningVersion.SupportsRecoveryState() && recoveryStateEnabled
287-
288286
addressMap := make(map[fdbv1beta2.ProcessGroupID][]fdbv1beta2.ProcessAddress, len(status.Cluster.Processes))
289287

290288
minimumUptime := math.Inf(1)
291-
if useRecoveryState {
289+
if runningVersion.SupportsRecoveryState() && recoveryStateEnabled {
292290
minimumUptime = status.Cluster.RecoveryState.SecondsSinceLastRecovered
293291
}
294292

@@ -307,7 +305,7 @@ func GetMinimumUptimeAndAddressMap(logger logr.Logger, cluster *fdbv1beta2.Found
307305

308306
addressMap[fdbv1beta2.ProcessGroupID(processGroupID)] = append(addressMap[fdbv1beta2.ProcessGroupID(process.Locality[fdbv1beta2.FDBLocalityInstanceIDKey])], process.Address)
309307

310-
if useRecoveryState || process.Excluded {
308+
if process.Excluded {
311309
continue
312310
}
313311

@@ -317,7 +315,10 @@ func GetMinimumUptimeAndAddressMap(logger logr.Logger, cluster *fdbv1beta2.Found
317315
continue
318316
}
319317

318+
// If the SecondsSinceLastRecovered is higher than the process uptime, we want to use the process uptime, e.g. in
319+
// cases where only storage processes are restarted.
320320
if process.UptimeSeconds < minimumUptime {
321+
logger.V(1).Info("Process uptime is less than the last recovery", "processGroupID", process.Address, "minimumUptime", minimumUptime, "process.UptimeSeconds", process.UptimeSeconds)
321322
minimumUptime = process.UptimeSeconds
322323
}
323324
}

pkg/fdbstatus/status_checks_test.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -842,7 +842,7 @@ var _ = Describe("status_checks", func() {
842842
},
843843
},
844844
}),
845-
Entry("when recovered since is enabled and version supports it",
845+
Entry("when recovered since is enabled and version supports it, it should use the minimum of recovered since or process uptime",
846846
&fdbv1beta2.FoundationDBCluster{
847847
Spec: fdbv1beta2.FoundationDBClusterSpec{
848848
Version: fdbv1beta2.Versions.SupportsRecoveryState.String(),
@@ -866,7 +866,7 @@ var _ = Describe("status_checks", func() {
866866
},
867867
},
868868
true,
869-
90.0,
869+
30.0,
870870
map[fdbv1beta2.ProcessGroupID][]fdbv1beta2.ProcessAddress{
871871
"test": {
872872
{

0 commit comments

Comments
 (0)