
Commit f9866f9

K8SPS-280: Improve full cluster crash recovery (#404)
* K8SPS-280: Improve full cluster crash recovery

Before these changes, we rebooted the cluster from a complete outage starting from pod-0, without checking which member had the latest transactions. Our full cluster crash recovery was therefore prone to data loss. Now we use mysql-shell's built-in checks to detect the member to reboot from. For this, mysql-shell requires every member to be reachable, so it can connect and check the GTIDs on each one. That means in case of a full cluster crash we need to start every pod and make sure it is reachable.

We're bringing back the `/var/lib/mysql/full-cluster-crash` file to address this requirement. Pods create this file if they detect a full cluster crash and restart themselves. After the restart, they start the mysqld process but ensure the server comes up read-only. Once all pods are up and running (ready), the operator runs `dba.rebootClusterFromCompleteOutage()` in one of the MySQL pods. Which pod we run it in doesn't matter, since mysql-shell connects to each pod and selects the suitable one to reboot from.

*Events*

This commit also introduces the event recorder and two events:

1. FullClusterCrashDetected
2. FullClusterCrashRecovered

Users will be able to see these events on the `PerconaServerMySQL` object. For example:

```
$ kubectl describe ps cluster1
...
Events:
  Type     Reason                     Age                 From           Message
  ----     ------                     ----                ----           -------
  Warning  FullClusterCrashDetected   19m (x10 over 20m)  ps-controller  Full cluster crash detected
  Normal   FullClusterCrashRecovered  17m                 ps-controller  Cluster recovered from full cluster crash
```

*Probe timeouts*

Kubernetes had problems with timeouts in exec probes, which were fixed in recent releases, but we still see problematic behavior. For example, even though Kubernetes successfully detects a timed-out probe, it doesn't count the timeout as a failure, so the container is not restarted even if its liveness probe times out a million times. With this commit we handle timeouts ourselves using contexts.

* fix limits test
* simplify exec commands
* add autoRecovery field
* don't reboot cluster more than necessary
* fix unit tests
* improve logs
1 parent 22f6d11 commit f9866f9
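The two events above are typically attached to the custom resource through a Kubernetes event recorder. Below is a minimal sketch of how that wiring could look in a controller-runtime based reconciler; the event reasons, messages, and the "ps-controller" name come from the commit message and the `kubectl describe` output, while the reconciler type and helper function are illustrative assumptions, not the operator's actual code.

```go
package controller

import (
	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/runtime"
	"k8s.io/client-go/tools/record"
	ctrl "sigs.k8s.io/controller-runtime"
)

// PerconaServerMySQLReconciler is a simplified stand-in for the operator's
// reconciler; only the event recorder wiring is shown here.
type PerconaServerMySQLReconciler struct {
	Recorder record.EventRecorder
}

// SetupWithManager obtains a named recorder from the manager so emitted
// events show "ps-controller" in the From column of `kubectl describe`.
func (r *PerconaServerMySQLReconciler) SetupWithManager(mgr ctrl.Manager) {
	r.Recorder = mgr.GetEventRecorderFor("ps-controller")
}

// reportFullClusterCrash attaches the two events described in the commit
// message to the PerconaServerMySQL custom resource (cr).
func (r *PerconaServerMySQLReconciler) reportFullClusterCrash(cr runtime.Object, recovered bool) {
	if recovered {
		r.Recorder.Event(cr, corev1.EventTypeNormal, "FullClusterCrashRecovered",
			"Cluster recovered from full cluster crash")
		return
	}
	r.Recorder.Event(cr, corev1.EventTypeWarning, "FullClusterCrashDetected",
		"Full cluster crash detected")
}
```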

File tree: 24 files changed, +601 / -278 lines

api/v1alpha1/perconaservermysql_types.go

Lines changed: 4 additions & 3 deletions
@@ -85,8 +85,9 @@ func (t ClusterType) isValid() bool {
 }
 
 type MySQLSpec struct {
-	ClusterType ClusterType            `json:"clusterType,omitempty"`
-	Expose      ServiceExposeTogglable `json:"expose,omitempty"`
+	ClusterType  ClusterType            `json:"clusterType,omitempty"`
+	Expose       ServiceExposeTogglable `json:"expose,omitempty"`
+	AutoRecovery bool                   `json:"autoRecovery,omitempty"`
 
 	Sidecars       []corev1.Container `json:"sidecars,omitempty"`
 	SidecarVolumes []corev1.Volume    `json:"sidecarVolumes,omitempty"`
@@ -480,7 +481,7 @@ func (cr *PerconaServerMySQL) CheckNSetDefaults(ctx context.Context, serverVersi
 		cr.Spec.MySQL.LivenessProbe.SuccessThreshold = 1
 	}
 	if cr.Spec.MySQL.LivenessProbe.TimeoutSeconds == 0 {
-		cr.Spec.MySQL.LivenessProbe.TimeoutSeconds = 30
+		cr.Spec.MySQL.LivenessProbe.TimeoutSeconds = 10
 	}
 
 	if cr.Spec.MySQL.ReadinessProbe.InitialDelaySeconds == 0 {
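This diff lowers the default liveness probe timeout from 30 to 10 seconds, and the commit message explains that exec-probe timeouts are now enforced inside the probe command itself via contexts, since kubelet may detect a timed-out exec probe without counting it as a failure. A minimal sketch of that pattern, assuming a hypothetical health-check command; the DSN, admin port, and 10-second budget are placeholders, not values taken from the operator:

```go
package main

import (
	"context"
	"database/sql"
	"fmt"
	"os"
	"time"

	_ "github.com/go-sql-driver/mysql"
)

// checkLiveness is a hypothetical probe body: it opens a connection and pings
// the local mysqld, failing if the context deadline expires first.
func checkLiveness(ctx context.Context, dsn string) error {
	db, err := sql.Open("mysql", dsn)
	if err != nil {
		return err
	}
	defer db.Close()
	// PingContext aborts with context.DeadlineExceeded once the timeout hits,
	// instead of relying on kubelet to kill the exec probe.
	return db.PingContext(ctx)
}

func main() {
	// Enforce the probe timeout ourselves; 10s mirrors the new default
	// LivenessProbe.TimeoutSeconds, but the real value would come from the spec.
	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()

	if err := checkLiveness(ctx, "operator:password@tcp(127.0.0.1:33062)/"); err != nil {
		fmt.Fprintln(os.Stderr, "liveness check failed:", err)
		os.Exit(1) // non-zero exit marks the probe as failed
	}
}
```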

build/ps-entrypoint.sh

Lines changed: 29 additions & 28 deletions
@@ -182,6 +182,12 @@ load_group_replication_plugin() {
 	POD_IP=$(hostname -I | awk '{print $1}')
 
 	sed -i "/\[mysqld\]/a plugin_load_add=group_replication.so" $CFG
+	sed -i "/\[mysqld\]/a group_replication_exit_state_action=ABORT_SERVER" $CFG
+}
+
+ensure_read_only() {
+	sed -i "/\[mysqld\]/a read_only=ON" $CFG
+	sed -i "/\[mysqld\]/a super_read_only=ON" $CFG
 }
 
 MYSQL_VERSION=$(mysqld -V | awk '{print $3}' | awk -F'.' '{print $1"."$2}')
@@ -399,34 +405,29 @@ if [[ -f /var/lib/mysql/full-cluster-crash ]]; then
 	namespace=$(</var/run/secrets/kubernetes.io/serviceaccount/namespace)
 
 	echo "######FULL_CLUSTER_CRASH:${node_name}######"
-	echo "You have full cluster crash. You need to recover the cluster manually. Here are the steps:"
-	echo ""
-	echo "Latest GTID_EXECUTED in this node is ${gtid_executed}"
-	echo "Compare GTIDs in each MySQL pod and select the one with the newest GTID."
-	echo ""
-	echo "Create /var/lib/mysql/force-bootstrap inside the mysql container. For example, if you select ${cluster_name}-mysql-2 to recover from:"
-	echo "$ kubectl -n ${namespace} exec ${cluster_name}-mysql-2 -c mysql -- touch /var/lib/mysql/force-bootstrap"
-	echo ""
-	echo "Remove /var/lib/mysql/full-cluster-crash in this pod to re-bootstrap the group. For example:"
-	echo "$ kubectl -n ${namespace} exec ${cluster_name}-mysql-2 -c mysql -- rm /var/lib/mysql/full-cluster-crash"
-	echo "This will restart the mysql container."
-	echo ""
-	echo "After group is bootstrapped and mysql container is ready, move on to the other pods:"
-	echo "$ kubectl -n ${namespace} exec ${cluster_name}-mysql-1 -c mysql -- rm /var/lib/mysql/full-cluster-crash"
-	echo "Wait until the pod ready"
-	echo ""
-	echo "$ kubectl -n ${namespace} exec ${cluster_name}-mysql-0 -c mysql -- rm /var/lib/mysql/full-cluster-crash"
-	echo "Wait until the pod ready"
-	echo ""
-	echo "Continue to other pods if you have more."
-	echo "#####LAST_LINE:${node_name}:${gtid_executed}"
-
-	for (( ; ; )); do
-		if [[ ! -f /var/lib/mysql/full-cluster-crash ]]; then
-			exit 0
-		fi
-		sleep 5
-	done
+	echo "You are in a full cluster crash. Operator will attempt to fix the issue automatically if you have spec.mysql.autoRecovery enabled."
+	echo "MySQL pods will be up and running in read only mode."
+	echo "Latest GTID_EXECUTED on this node is ${gtid_executed}"
+	echo "If you have spec.mysql.autoRecovery disabled, wait for all pods to be up and running and connect to one of them using mysql-shell:"
+	echo "kubectl -n ${namespace} exec -it $(hostname) -- mysqlsh root:<password>@localhost"
+	echo "and run the following command to reboot cluster:"
+	echo "dba.rebootClusterFromCompleteOutage()"
+	echo "and delete /var/lib/mysql/full-cluster-crash file in each pod."
+	echo "######FULL_CLUSTER_CRASH:${node_name}######"
+
+	ensure_read_only
+fi
+
+recovery_file='/var/lib/mysql/sleep-forever'
+if [ -f "${recovery_file}" ]; then
+	set +o xtrace
+	echo "The $recovery_file file is detected, node is going to infinity loop"
+	echo "If you want to exit from infinity loop you need to remove $recovery_file file"
+	for (( ; ; )); do
+		if [ ! -f "${recovery_file}" ]; then
+			exit 0
+		fi
+	done
 fi
 
 exec "$@"

cmd/bootstrap/async_replication.go

Lines changed: 21 additions & 20 deletions
@@ -1,6 +1,7 @@
 package main
 
 import (
+	"context"
 	"log"
 	"os"
 	"path/filepath"
@@ -14,7 +15,7 @@ import (
 	"github.com/percona/percona-server-mysql-operator/pkg/replicator"
 )
 
-func bootstrapAsyncReplication() error {
+func bootstrapAsyncReplication(ctx context.Context) error {
 	timer := stopwatch.NewNamedStopwatch()
 	err := timer.AddMany([]string{"clone", "total"})
 	if err != nil {
@@ -45,7 +46,7 @@ func bootstrapAsyncReplication() error {
 			return errors.Wrap(err, "wait lock removal")
 		}
 	}
-	primary, replicas, err := getTopology(peers)
+	primary, replicas, err := getTopology(ctx, peers)
 	if err != nil {
 		return errors.Wrap(err, "select donor")
 	}
@@ -74,7 +75,7 @@ func bootstrapAsyncReplication() error {
 	}
 	log.Printf("PrimaryIP: %s", primaryIp)
 
-	donor, err := selectDonor(fqdn, primary, replicas)
+	donor, err := selectDonor(ctx, fqdn, primary, replicas)
 	if err != nil {
 		return errors.Wrap(err, "select donor")
 	}
@@ -86,33 +87,33 @@ func bootstrapAsyncReplication() error {
 		return errors.Wrapf(err, "get %s password", apiv1alpha1.UserOperator)
 	}
 
-	db, err := replicator.NewReplicator("operator", operatorPass, podIp, mysql.DefaultAdminPort)
+	db, err := replicator.NewReplicator(ctx, "operator", operatorPass, podIp, mysql.DefaultAdminPort)
 	if err != nil {
 		return errors.Wrap(err, "connect to db")
 	}
 	defer db.Close()
 
-	if err := db.StopReplication(); err != nil {
+	if err := db.StopReplication(ctx); err != nil {
 		return err
 	}
 
 	switch {
 	case donor == "":
-		if err := db.ResetReplication(); err != nil {
+		if err := db.ResetReplication(ctx); err != nil {
 			return err
 		}
 
 		log.Printf("Can't find a donor, we're on our own.")
 		return nil
 	case donor == fqdn:
-		if err := db.ResetReplication(); err != nil {
+		if err := db.ResetReplication(ctx); err != nil {
 			return err
 		}
 
 		log.Printf("I'm the donor and therefore the primary.")
 		return nil
 	case primary == fqdn || primaryIp == podIp:
-		if err := db.ResetReplication(); err != nil {
+		if err := db.ResetReplication(ctx); err != nil {
 			return err
 		}
 
@@ -129,7 +130,7 @@ func bootstrapAsyncReplication() error {
 	log.Printf("Clone required: %t", requireClone)
 	if requireClone {
 		log.Println("Checking if a clone in progress")
-		inProgress, err := db.CloneInProgress()
+		inProgress, err := db.CloneInProgress(ctx)
 		if err != nil {
 			return errors.Wrap(err, "check if a clone in progress")
 		}
@@ -141,7 +142,7 @@ func bootstrapAsyncReplication() error {
 
 		timer.Start("clone")
 		log.Printf("Cloning from %s", donor)
-		err = db.Clone(donor, "operator", operatorPass, mysql.DefaultAdminPort)
+		err = db.Clone(ctx, donor, "operator", operatorPass, mysql.DefaultAdminPort)
 		timer.Stop("clone")
 		if err != nil && !errors.Is(err, replicator.ErrRestartAfterClone) {
 			return errors.Wrapf(err, "clone from donor %s", donor)
@@ -164,7 +165,7 @@ func bootstrapAsyncReplication() error {
 		}
 	}
 
-	rStatus, _, err := db.ReplicationStatus()
+	rStatus, _, err := db.ReplicationStatus(ctx)
 	if err != nil {
 		return errors.Wrap(err, "check replication status")
 	}
@@ -177,23 +178,23 @@ func bootstrapAsyncReplication() error {
 			return errors.Wrapf(err, "get %s password", apiv1alpha1.UserReplication)
 		}
 
-		if err := db.StopReplication(); err != nil {
+		if err := db.StopReplication(ctx); err != nil {
 			return errors.Wrap(err, "stop replication")
 		}
 
-		if err := db.StartReplication(primary, replicaPass, mysql.DefaultPort); err != nil {
+		if err := db.StartReplication(ctx, primary, replicaPass, mysql.DefaultPort); err != nil {
 			return errors.Wrap(err, "start replication")
 		}
 	}
 
-	if err := db.EnableSuperReadonly(); err != nil {
+	if err := db.EnableSuperReadonly(ctx); err != nil {
 		return errors.Wrap(err, "enable super read only")
 	}
 
 	return nil
 }
 
-func getTopology(peers sets.Set[string]) (string, []string, error) {
+func getTopology(ctx context.Context, peers sets.Set[string]) (string, []string, error) {
 	replicas := sets.New[string]()
 	primary := ""
 
@@ -203,18 +204,18 @@ func getTopology(peers sets.Set[string]) (string, []string, error) {
 	}
 
 	for _, peer := range sets.List(peers) {
-		db, err := replicator.NewReplicator("operator", operatorPass, peer, mysql.DefaultAdminPort)
+		db, err := replicator.NewReplicator(ctx, "operator", operatorPass, peer, mysql.DefaultAdminPort)
 		if err != nil {
 			return "", nil, errors.Wrapf(err, "connect to %s", peer)
 		}
 		defer db.Close()
 
-		status, source, err := db.ReplicationStatus()
+		status, source, err := db.ReplicationStatus(ctx)
 		if err != nil {
 			return "", nil, errors.Wrap(err, "check replication status")
 		}
 
-		replicaHost, err := db.ReportHost()
+		replicaHost, err := db.ReportHost(ctx)
 		if err != nil {
 			return "", nil, errors.Wrap(err, "get report_host")
 		}
@@ -241,7 +242,7 @@ func getTopology(peers sets.Set[string]) (string, []string, error) {
 	return primary, sets.List(replicas), nil
 }
 
-func selectDonor(fqdn, primary string, replicas []string) (string, error) {
+func selectDonor(ctx context.Context, fqdn, primary string, replicas []string) (string, error) {
 	donor := ""
 
 	operatorPass, err := getSecret(apiv1alpha1.UserOperator)
@@ -250,7 +251,7 @@ func selectDonor(fqdn, primary string, replicas []string) (string, error) {
 	}
 
 	for _, replica := range replicas {
-		db, err := replicator.NewReplicator("operator", operatorPass, replica, mysql.DefaultAdminPort)
+		db, err := replicator.NewReplicator(ctx, "operator", operatorPass, replica, mysql.DefaultAdminPort)
 		if err != nil {
 			continue
 		}
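Threading `context.Context` through every replicator call is what lets the bootstrap binary bound its own execution instead of relying on probe machinery to kill it. A minimal, illustrative caller under that assumption; `bootstrapAsyncReplication` has the signature introduced in this diff, but the wrapper and the 5-minute budget shown here are assumptions, not the repo's actual cmd/bootstrap entrypoint.

```go
package main

import (
	"context"
	"log"
	"time"
)

// main wraps the bootstrap in a single deadline so a hung peer cannot stall
// the container forever; cancellation propagates into every replicator call
// that now accepts ctx. The 5-minute budget is an assumed placeholder.
func main() {
	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
	defer cancel()

	if err := bootstrapAsyncReplication(ctx); err != nil {
		log.Fatalf("bootstrap failed: %v", err)
	}
}
```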
