
Commit f9866f9

K8SPS-280: Improve full cluster crash recovery (#404)
* K8SPS-280: Improve full cluster crash recovery

Before these changes, we rebooted the cluster from a complete outage starting from pod-0, without checking which member had the latest transactions. Our full cluster crash recovery was therefore prone to data loss. Now we use mysql-shell's built-in checks to detect the member to reboot from. For this, mysql-shell requires every member to be reachable, so it can connect and check the GTIDs on each one. That means in case of a full cluster crash we need to start every pod and make sure it is reachable.

We're bringing back the `/var/lib/mysql/full-cluster-crash` file to address this requirement. Pods create this file if they detect a full cluster crash and restart themselves. After the restart, they start the mysqld process but ensure the server comes up read-only. Once all pods are up and running (ready), the operator runs `dba.rebootClusterFromCompleteOutage()` in one of the MySQL pods. Which pod we run it in doesn't matter, since mysql-shell connects to each pod and selects the suitable one to reboot from.

*Events*

This commit also introduces the event recorder and two events:

1. FullClusterCrashDetected
2. FullClusterCrashRecovered

Users will be able to see these events on the `PerconaServerMySQL` object. For example:

```
$ kubectl describe ps cluster1
...
Events:
  Type     Reason                     Age                 From           Message
  ----     ------                     ----                ----           -------
  Warning  FullClusterCrashDetected   19m (x10 over 20m)  ps-controller  Full cluster crash detected
  Normal   FullClusterCrashRecovered  17m                 ps-controller  Cluster recovered from full cluster crash
```

*Probe timeouts*

Kubernetes had problems with timeouts in exec probes, which were fixed in recent releases, but we still see problematic behavior. For example, even though Kubernetes successfully detects a timed-out probe, it doesn't count the timeout as a failure, so the container is not restarted even if its liveness probe times out a million times. With this commit we handle timeouts ourselves using contexts.

* fix limits test
* simplify exec commands
* add autoRecovery field
* don't reboot cluster more than necessary
* fix unit tests
* improve logs
1 parent 22f6d11 commit f9866f9
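The two events above are typically attached to the custom resource through a Kubernetes event recorder. Below is a minimal sketch of how that wiring could look in a controller-runtime based reconciler; the event reasons, messages, and the "ps-controller" name come from the commit message and the `kubectl describe` output, while the reconciler type and helper function are illustrative assumptions, not the operator's actual code.

```go
package controller

import (
	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/runtime"
	"k8s.io/client-go/tools/record"
	ctrl "sigs.k8s.io/controller-runtime"
)

// PerconaServerMySQLReconciler is a simplified stand-in for the operator's
// reconciler; only the event recorder wiring is shown here.
type PerconaServerMySQLReconciler struct {
	Recorder record.EventRecorder
}

// SetupWithManager obtains a named recorder from the manager so emitted
// events show "ps-controller" in the From column of `kubectl describe`.
func (r *PerconaServerMySQLReconciler) SetupWithManager(mgr ctrl.Manager) {
	r.Recorder = mgr.GetEventRecorderFor("ps-controller")
}

// reportFullClusterCrash attaches the two events described in the commit
// message to the PerconaServerMySQL custom resource (cr).
func (r *PerconaServerMySQLReconciler) reportFullClusterCrash(cr runtime.Object, recovered bool) {
	if recovered {
		r.Recorder.Event(cr, corev1.EventTypeNormal, "FullClusterCrashRecovered",
			"Cluster recovered from full cluster crash")
		return
	}
	r.Recorder.Event(cr, corev1.EventTypeWarning, "FullClusterCrashDetected",
		"Full cluster crash detected")
}
```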

File tree: 24 files changed, +601 / -278 lines

api/v1alpha1/perconaservermysql_types.go

Lines changed: 4 additions & 3 deletions
@@ -85,8 +85,9 @@ func (t ClusterType) isValid() bool {
 }
 
 type MySQLSpec struct {
-	ClusterType ClusterType            `json:"clusterType,omitempty"`
-	Expose      ServiceExposeTogglable `json:"expose,omitempty"`
+	ClusterType  ClusterType            `json:"clusterType,omitempty"`
+	Expose       ServiceExposeTogglable `json:"expose,omitempty"`
+	AutoRecovery bool                   `json:"autoRecovery,omitempty"`
 
 	Sidecars       []corev1.Container `json:"sidecars,omitempty"`
 	SidecarVolumes []corev1.Volume    `json:"sidecarVolumes,omitempty"`
@@ -480,7 +481,7 @@ func (cr *PerconaServerMySQL) CheckNSetDefaults(ctx context.Context, serverVersi
 		cr.Spec.MySQL.LivenessProbe.SuccessThreshold = 1
 	}
 	if cr.Spec.MySQL.LivenessProbe.TimeoutSeconds == 0 {
-		cr.Spec.MySQL.LivenessProbe.TimeoutSeconds = 30
+		cr.Spec.MySQL.LivenessProbe.TimeoutSeconds = 10
 	}
 
 	if cr.Spec.MySQL.ReadinessProbe.InitialDelaySeconds == 0 {
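This diff lowers the default liveness probe timeout from 30 to 10 seconds, and the commit message explains that exec-probe timeouts are now enforced inside the probe command itself via contexts, since kubelet may detect a timed-out exec probe without counting it as a failure. A minimal sketch of that pattern, assuming a hypothetical health-check command; the DSN, admin port, and 10-second budget are placeholders, not values taken from the operator:

```go
package main

import (
	"context"
	"database/sql"
	"fmt"
	"os"
	"time"

	_ "github.com/go-sql-driver/mysql"
)

// checkLiveness is a hypothetical probe body: it opens a connection and pings
// the local mysqld, failing if the context deadline expires first.
func checkLiveness(ctx context.Context, dsn string) error {
	db, err := sql.Open("mysql", dsn)
	if err != nil {
		return err
	}
	defer db.Close()
	// PingContext aborts with context.DeadlineExceeded once the timeout hits,
	// instead of relying on kubelet to kill the exec probe.
	return db.PingContext(ctx)
}

func main() {
	// Enforce the probe timeout ourselves; 10s mirrors the new default
	// LivenessProbe.TimeoutSeconds, but the real value would come from the spec.
	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()

	if err := checkLiveness(ctx, "operator:password@tcp(127.0.0.1:33062)/"); err != nil {
		fmt.Fprintln(os.Stderr, "liveness check failed:", err)
		os.Exit(1) // non-zero exit marks the probe as failed
	}
}
```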

build/ps-entrypoint.sh

Lines changed: 29 additions & 28 deletions
@@ -182,6 +182,12 @@ load_group_replication_plugin() {
 	POD_IP=$(hostname -I | awk '{print $1}')
 
 	sed -i "/\[mysqld\]/a plugin_load_add=group_replication.so" $CFG
+	sed -i "/\[mysqld\]/a group_replication_exit_state_action=ABORT_SERVER" $CFG
+}
+
+ensure_read_only() {
+	sed -i "/\[mysqld\]/a read_only=ON" $CFG
+	sed -i "/\[mysqld\]/a super_read_only=ON" $CFG
 }
 
 MYSQL_VERSION=$(mysqld -V | awk '{print $3}' | awk -F'.' '{print $1"."$2}')
@@ -399,34 +405,29 @@ if [[ -f /var/lib/mysql/full-cluster-crash ]]; then
 	namespace=$(</var/run/secrets/kubernetes.io/serviceaccount/namespace)
 
 	echo "######FULL_CLUSTER_CRASH:${node_name}######"
-	echo "You have full cluster crash. You need to recover the cluster manually. Here are the steps:"
-	echo ""
-	echo "Latest GTID_EXECUTED in this node is ${gtid_executed}"
-	echo "Compare GTIDs in each MySQL pod and select the one with the newest GTID."
-	echo ""
-	echo "Create /var/lib/mysql/force-bootstrap inside the mysql container. For example, if you select ${cluster_name}-mysql-2 to recover from:"
-	echo "$ kubectl -n ${namespace} exec ${cluster_name}-mysql-2 -c mysql -- touch /var/lib/mysql/force-bootstrap"
-	echo ""
-	echo "Remove /var/lib/mysql/full-cluster-crash in this pod to re-bootstrap the group. For example:"
-	echo "$ kubectl -n ${namespace} exec ${cluster_name}-mysql-2 -c mysql -- rm /var/lib/mysql/full-cluster-crash"
-	echo "This will restart the mysql container."
-	echo ""
-	echo "After group is bootstrapped and mysql container is ready, move on to the other pods:"
-	echo "$ kubectl -n ${namespace} exec ${cluster_name}-mysql-1 -c mysql -- rm /var/lib/mysql/full-cluster-crash"
-	echo "Wait until the pod ready"
-	echo ""
-	echo "$ kubectl -n ${namespace} exec ${cluster_name}-mysql-0 -c mysql -- rm /var/lib/mysql/full-cluster-crash"
-	echo "Wait until the pod ready"
-	echo ""
-	echo "Continue to other pods if you have more."
-	echo "#####LAST_LINE:${node_name}:${gtid_executed}"
-
-	for (( ; ; )); do
-		if [[ ! -f /var/lib/mysql/full-cluster-crash ]]; then
-			exit 0
-		fi
-		sleep 5
-	done
+	echo "You are in a full cluster crash. Operator will attempt to fix the issue automatically if you have spec.mysql.autoRecovery enabled."
+	echo "MySQL pods will be up and running in read only mode."
+	echo "Latest GTID_EXECUTED on this node is ${gtid_executed}"
+	echo "If you have spec.mysql.autoRecovery disabled, wait for all pods to be up and running and connect to one of them using mysql-shell:"
+	echo "kubectl -n ${namespace} exec -it $(hostname) -- mysqlsh root:<password>@localhost"
+	echo "and run the following command to reboot cluster:"
+	echo "dba.rebootClusterFromCompleteOutage()"
+	echo "and delete /var/lib/mysql/full-cluster-crash file in each pod."
+	echo "######FULL_CLUSTER_CRASH:${node_name}######"
+
+	ensure_read_only
+fi
+
+recovery_file='/var/lib/mysql/sleep-forever'
+if [ -f "${recovery_file}" ]; then
+	set +o xtrace
+	echo "The $recovery_file file is detected, node is going to infinity loop"
+	echo "If you want to exit from infinity loop you need to remove $recovery_file file"
+	for (( ; ; )); do
+		if [ ! -f "${recovery_file}" ]; then
+			exit 0
+		fi
+	done
 fi
 
 exec "$@"

cmd/bootstrap/async_replication.go

Lines changed: 21 additions & 20 deletions
@@ -1,6 +1,7 @@
 package main
 
 import (
+	"context"
 	"log"
 	"os"
 	"path/filepath"
@@ -14,7 +15,7 @@ import (
 	"github.com/percona/percona-server-mysql-operator/pkg/replicator"
 )
 
-func bootstrapAsyncReplication() error {
+func bootstrapAsyncReplication(ctx context.Context) error {
 	timer := stopwatch.NewNamedStopwatch()
 	err := timer.AddMany([]string{"clone", "total"})
 	if err != nil {
@@ -45,7 +46,7 @@ func bootstrapAsyncReplication() error {
 			return errors.Wrap(err, "wait lock removal")
 		}
 	}
-	primary, replicas, err := getTopology(peers)
+	primary, replicas, err := getTopology(ctx, peers)
 	if err != nil {
 		return errors.Wrap(err, "select donor")
 	}
@@ -74,7 +75,7 @@ func bootstrapAsyncReplication() error {
 	}
 	log.Printf("PrimaryIP: %s", primaryIp)
 
-	donor, err := selectDonor(fqdn, primary, replicas)
+	donor, err := selectDonor(ctx, fqdn, primary, replicas)
 	if err != nil {
 		return errors.Wrap(err, "select donor")
 	}
@@ -86,33 +87,33 @@ func bootstrapAsyncReplication() error {
 		return errors.Wrapf(err, "get %s password", apiv1alpha1.UserOperator)
 	}
 
-	db, err := replicator.NewReplicator("operator", operatorPass, podIp, mysql.DefaultAdminPort)
+	db, err := replicator.NewReplicator(ctx, "operator", operatorPass, podIp, mysql.DefaultAdminPort)
 	if err != nil {
 		return errors.Wrap(err, "connect to db")
 	}
 	defer db.Close()
 
-	if err := db.StopReplication(); err != nil {
+	if err := db.StopReplication(ctx); err != nil {
 		return err
 	}
 
 	switch {
 	case donor == "":
-		if err := db.ResetReplication(); err != nil {
+		if err := db.ResetReplication(ctx); err != nil {
 			return err
 		}
 
 		log.Printf("Can't find a donor, we're on our own.")
 		return nil
 	case donor == fqdn:
-		if err := db.ResetReplication(); err != nil {
+		if err := db.ResetReplication(ctx); err != nil {
 			return err
 		}
 
 		log.Printf("I'm the donor and therefore the primary.")
 		return nil
 	case primary == fqdn || primaryIp == podIp:
-		if err := db.ResetReplication(); err != nil {
+		if err := db.ResetReplication(ctx); err != nil {
 			return err
 		}
 
@@ -129,7 +130,7 @@ func bootstrapAsyncReplication() error {
 	log.Printf("Clone required: %t", requireClone)
 	if requireClone {
 		log.Println("Checking if a clone in progress")
-		inProgress, err := db.CloneInProgress()
+		inProgress, err := db.CloneInProgress(ctx)
 		if err != nil {
 			return errors.Wrap(err, "check if a clone in progress")
 		}
@@ -141,7 +142,7 @@ func bootstrapAsyncReplication() error {
 
 		timer.Start("clone")
 		log.Printf("Cloning from %s", donor)
-		err = db.Clone(donor, "operator", operatorPass, mysql.DefaultAdminPort)
+		err = db.Clone(ctx, donor, "operator", operatorPass, mysql.DefaultAdminPort)
 		timer.Stop("clone")
 		if err != nil && !errors.Is(err, replicator.ErrRestartAfterClone) {
 			return errors.Wrapf(err, "clone from donor %s", donor)
@@ -164,7 +165,7 @@ func bootstrapAsyncReplication() error {
 		}
 	}
 
-	rStatus, _, err := db.ReplicationStatus()
+	rStatus, _, err := db.ReplicationStatus(ctx)
 	if err != nil {
 		return errors.Wrap(err, "check replication status")
 	}
@@ -177,23 +178,23 @@ func bootstrapAsyncReplication() error {
 			return errors.Wrapf(err, "get %s password", apiv1alpha1.UserReplication)
 		}
 
-		if err := db.StopReplication(); err != nil {
+		if err := db.StopReplication(ctx); err != nil {
 			return errors.Wrap(err, "stop replication")
 		}
 
-		if err := db.StartReplication(primary, replicaPass, mysql.DefaultPort); err != nil {
+		if err := db.StartReplication(ctx, primary, replicaPass, mysql.DefaultPort); err != nil {
 			return errors.Wrap(err, "start replication")
 		}
 	}
 
-	if err := db.EnableSuperReadonly(); err != nil {
+	if err := db.EnableSuperReadonly(ctx); err != nil {
 		return errors.Wrap(err, "enable super read only")
 	}
 
 	return nil
 }
 
-func getTopology(peers sets.Set[string]) (string, []string, error) {
+func getTopology(ctx context.Context, peers sets.Set[string]) (string, []string, error) {
 	replicas := sets.New[string]()
 	primary := ""
 
@@ -203,18 +204,18 @@ func getTopology(peers sets.Set[string]) (string, []string, error) {
 	}
 
 	for _, peer := range sets.List(peers) {
-		db, err := replicator.NewReplicator("operator", operatorPass, peer, mysql.DefaultAdminPort)
+		db, err := replicator.NewReplicator(ctx, "operator", operatorPass, peer, mysql.DefaultAdminPort)
 		if err != nil {
 			return "", nil, errors.Wrapf(err, "connect to %s", peer)
 		}
 		defer db.Close()
 
-		status, source, err := db.ReplicationStatus()
+		status, source, err := db.ReplicationStatus(ctx)
 		if err != nil {
 			return "", nil, errors.Wrap(err, "check replication status")
 		}
 
-		replicaHost, err := db.ReportHost()
+		replicaHost, err := db.ReportHost(ctx)
 		if err != nil {
 			return "", nil, errors.Wrap(err, "get report_host")
 		}
@@ -241,7 +242,7 @@ func getTopology(peers sets.Set[string]) (string, []string, error) {
 	return primary, sets.List(replicas), nil
 }
 
-func selectDonor(fqdn, primary string, replicas []string) (string, error) {
+func selectDonor(ctx context.Context, fqdn, primary string, replicas []string) (string, error) {
 	donor := ""
 
 	operatorPass, err := getSecret(apiv1alpha1.UserOperator)
@@ -250,7 +251,7 @@ func selectDonor(fqdn, primary string, replicas []string) (string, error) {
 	}
 
 	for _, replica := range replicas {
-		db, err := replicator.NewReplicator("operator", operatorPass, replica, mysql.DefaultAdminPort)
+		db, err := replicator.NewReplicator(ctx, "operator", operatorPass, replica, mysql.DefaultAdminPort)
 		if err != nil {
 			continue
 		}
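Threading `context.Context` through every replicator call is what lets the bootstrap binary bound its own execution instead of relying on probe machinery to kill it. A minimal, illustrative caller under that assumption; `bootstrapAsyncReplication` has the signature introduced in this diff, but the wrapper and the 5-minute budget shown here are assumptions, not the repo's actual cmd/bootstrap entrypoint.

```go
package main

import (
	"context"
	"log"
	"time"
)

// main wraps the bootstrap in a single deadline so a hung peer cannot stall
// the container forever; cancellation propagates into every replicator call
// that now accepts ctx. The 5-minute budget is an assumed placeholder.
func main() {
	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
	defer cancel()

	if err := bootstrapAsyncReplication(ctx); err != nil {
		log.Fatalf("bootstrap failed: %v", err)
	}
}
```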
