Skip to content

Commit 67d53bd

Browse files
committed
Fast bootstrap using safe_to_bootstrap flag
When a galera cluster is successfully stopped, the last node to shut down sets an internal flag called "safe_to_bootstrap", which signals that this node should be the one to restart the cluster from. This can help to improve bootstrap because once this flag is found on a node, there is no need to inspect other nodes. This can also help to bootstrap a cluster in case e.g. various nodes are unavailable or they suffered a db corruption. As soon as safe_to_bootstrap is read on a node, the cluster can be restarted from it. Change the way the mariadb operator extracts the sequence number information from galera nodes, to retrieve additional flags such as the "safe_to_bootstrap", and use it to improve the bootstrap node selection. Note that this requires that the cluster stops in an orchestrated way, which is not always the case, for instance if all pods are deleted at once. To get around it, add a PreStop hook script to sequentialize shutdown of pods and their disconnection from the galera cluster. Jira: OSPRH-10195
1 parent 61d230f commit 67d53bd

File tree

8 files changed

+201
-45
lines changed

8 files changed

+201
-45
lines changed

api/bases/mariadb.openstack.org_galeras.yaml

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -124,10 +124,21 @@ spec:
124124
gcomm:
125125
description: Gcomm URI used to connect to the galera cluster
126126
type: string
127+
no_grastate:
128+
description: This galera node has its state recovered from the
129+
DB
130+
type: boolean
131+
safe_to_bootstrap:
132+
description: This galera node can bootstrap a galera cluster
133+
type: boolean
127134
seqno:
128135
description: Last recorded replication sequence number in the
129136
DB
130137
type: string
138+
uuid:
139+
description: UUID of the partition that is seen by the galera
140+
node
141+
type: string
131142
required:
132143
- seqno
133144
type: object

api/v1beta1/galera_types.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,8 +93,14 @@ type GaleraSpecCore struct {
9393

9494
// GaleraAttributes holds startup information for a Galera host
9595
type GaleraAttributes struct {
96+
// UUID of the partition that is seen by the galera node
97+
UUID string `json:"uuid,omitempty"`
9698
// Last recorded replication sequence number in the DB
9799
Seqno string `json:"seqno"`
100+
// This galera node can bootstrap a galera cluster
101+
SafeToBootstrap bool `json:"safe_to_bootstrap,omitempty"`
102+
// This galera node has its state recovered from the DB
103+
NoGrastate bool `json:"no_grastate,omitempty"`
98104
// Gcomm URI used to connect to the galera cluster
99105
Gcomm string `json:"gcomm,omitempty"`
100106
// Identifier of the container at the time the gcomm URI was injected

config/crd/bases/mariadb.openstack.org_galeras.yaml

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -124,10 +124,21 @@ spec:
124124
gcomm:
125125
description: Gcomm URI used to connect to the galera cluster
126126
type: string
127+
no_grastate:
128+
description: This galera node has its state recovered from the
129+
DB
130+
type: boolean
131+
safe_to_bootstrap:
132+
description: This galera node can bootstrap a galera cluster
133+
type: boolean
127134
seqno:
128135
description: Last recorded replication sequence number in the
129136
DB
130137
type: string
138+
uuid:
139+
description: UUID of the partition that is seen by the galera
140+
node
141+
type: string
131142
required:
132143
- seqno
133144
type: object

controllers/galera_controller.go

Lines changed: 49 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ package controllers
1919
import (
2020
"bytes"
2121
"context"
22+
"encoding/json"
2223
"fmt"
2324
"sort"
2425
"strconv"
@@ -94,20 +95,31 @@ func GetLog(ctx context.Context, controller string) logr.Logger {
9495
//
9596

9697
// findBestCandidate returns the node with the highest seqno
97-
func findBestCandidate(status *mariadbv1.GaleraStatus) string {
98-
sortednodes := maps.Keys(status.Attributes)
98+
func findBestCandidate(g *mariadbv1.Galera) (node string, found bool) {
99+
sortednodes := maps.Keys(g.Status.Attributes)
99100
sort.Strings(sortednodes)
100101
bestnode := ""
101102
bestseqno := -1
102103
for _, node := range sortednodes {
103-
seqno := status.Attributes[node].Seqno
104+
// On clean shutdown, galera sets the last
105+
// stopped node as 'safe to bootstrap', so use
106+
// this hint when we can
107+
if g.Status.Attributes[node].SafeToBootstrap {
108+
return node, true
109+
}
110+
seqno := g.Status.Attributes[node].Seqno
104111
intseqno, _ := strconv.Atoi(seqno)
105112
if intseqno >= bestseqno {
106113
bestnode = node
107114
bestseqno = intseqno
108115
}
109116
}
110-
return bestnode //"galera-0"
117+
// if we pass here, a candidate is only valid if we
118+
// inspected all the expected replicas (e.g. typically 3)
119+
if len(g.Status.Attributes) != int(*g.Spec.Replicas) {
120+
return "", false
121+
}
122+
return bestnode, true //"galera-0"
111123
}
112124

113125
// buildGcommURI builds a gcomm URI for a galera instance
@@ -230,18 +242,22 @@ func injectGcommURI(ctx context.Context, h *helper.Helper, config *rest.Config,
230242
}
231243

232244
// retrieveSequenceNumber probes a pod's galera instance for sequence number
233-
func retrieveSequenceNumber(ctx context.Context, helper *helper.Helper, config *rest.Config, instance *mariadbv1.Galera, pod *corev1.Pod) error {
234-
err := mariadb.ExecInPod(ctx, helper, config, instance.Namespace, pod.Name, "galera",
245+
func retrieveSequenceNumber(ctx context.Context, helper *helper.Helper, config *rest.Config, instance *mariadbv1.Galera, pod *corev1.Pod) (errStr []string, err error) {
246+
errStr = nil
247+
err = mariadb.ExecInPod(ctx, helper, config, instance.Namespace, pod.Name, "galera",
235248
[]string{"/bin/bash", "/var/lib/operator-scripts/detect_last_commit.sh"},
236-
func(stdout *bytes.Buffer, _ *bytes.Buffer) error {
237-
seqno := strings.TrimSuffix(stdout.String(), "\n")
238-
attr := mariadbv1.GaleraAttributes{
239-
Seqno: seqno,
249+
func(stdout *bytes.Buffer, stderr *bytes.Buffer) error {
250+
var attr mariadbv1.GaleraAttributes
251+
if err := json.Unmarshal(stdout.Bytes(), &attr); err != nil {
252+
return err
253+
}
254+
if stderr.Len() > 0 {
255+
errStr = strings.Split(strings.TrimSuffix(stderr.String(), "\n"), "\n")
240256
}
241257
instance.Status.Attributes[pod.Name] = attr
242258
return nil
243259
})
244-
return err
260+
return
245261
}
246262

247263
// clearPodAttributes clears information known by the operator about a pod
@@ -753,7 +769,7 @@ func (r *GaleraReconciler) Reconcile(ctx context.Context, req ctrl.Request) (res
753769
for _, pod := range getReadyPods(podList.Items) {
754770
name := pod.Name
755771
if _, found := instance.Status.Attributes[name]; found {
756-
log.Info("Galera started on", "pod", pod.Name)
772+
log.Info("Galera started", "pod", name)
757773
clearPodAttributes(instance, name)
758774
}
759775
}
@@ -793,21 +809,36 @@ func (r *GaleraReconciler) Reconcile(ctx context.Context, req ctrl.Request) (res
793809
// . any other status means the pod is starting/restarting. We can't
794810
// exec into the pod yet, so we will probe it in another reconcile loop.
795811
if !instance.Status.Bootstrapped && !isBootstrapInProgress(instance) {
812+
var node string
813+
found := false
796814
for _, pod := range getRunningPodsMissingAttributes(ctx, podList.Items, instance, helper, r.config) {
797815
name := pod.Name
798816
util.LogForObject(helper, fmt.Sprintf("Pod %s running, retrieve seqno", name), instance)
799-
err := retrieveSequenceNumber(ctx, helper, r.config, instance, &pod)
817+
warn, err := retrieveSequenceNumber(ctx, helper, r.config, instance, &pod)
818+
if len(warn) > 0 {
819+
util.LogForObject(helper, fmt.Sprintf("Warning: %q", warn), instance)
820+
}
800821
if err != nil {
801-
log.Error(err, "Failed to retrieve seqno for ", "name", name)
822+
log.Error(err, fmt.Sprintf("Failed to retrieve seqno for %s", name))
802823
return ctrl.Result{}, err
803824
}
804-
log.Info("", "Pod", name, "seqno:", instance.Status.Attributes[name].Seqno)
825+
log.Info(fmt.Sprintf("Attributes retrieved for %s", name),
826+
"UUID", instance.Status.Attributes[name].UUID,
827+
"Seqno", instance.Status.Attributes[name].Seqno,
828+
"SafeToBootstrap", instance.Status.Attributes[name].SafeToBootstrap,
829+
)
830+
if instance.Status.Attributes[name].SafeToBootstrap {
831+
node = name
832+
found = true
833+
break
834+
}
805835
}
806836

807837
// Check if we have enough info to bootstrap the cluster now
808-
if (len(instance.Status.Attributes) > 0) &&
809-
(len(instance.Status.Attributes) == len(podList.Items)) {
810-
node := findBestCandidate(&instance.Status)
838+
if !found {
839+
node, found = findBestCandidate(instance)
840+
}
841+
if found {
811842
pod := getPodFromName(podList.Items, node)
812843
log.Info("Pushing gcomm URI to bootstrap", "pod", node)
813844
// Setting the gcomm attribute marks this pod as 'currently bootstrapping the cluster'

pkg/mariadb/statefulset.go

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,7 @@ func StatefulSet(g *mariadbv1.Galera, configHash string) *appsv1.StatefulSet {
7878
},
7979
corev1.LabelHostname,
8080
)
81-
if g.Spec.NodeSelector != nil && len(g.Spec.NodeSelector) > 0 {
81+
if len(g.Spec.NodeSelector) > 0 {
8282
sts.Spec.Template.Spec.NodeSelector = g.Spec.NodeSelector
8383
}
8484

@@ -164,6 +164,13 @@ func getGaleraContainers(g *mariadbv1.Galera, configHash string) []corev1.Contai
164164
},
165165
},
166166
},
167+
Lifecycle: &corev1.Lifecycle{
168+
PreStop: &corev1.LifecycleHandler{
169+
Exec: &corev1.ExecAction{
170+
Command: []string{"/bin/bash", "/var/lib/operator-scripts/mysql_shutdown.sh"},
171+
},
172+
},
173+
},
167174
}}
168175
logSideCar := corev1.Container{
169176
Image: g.Spec.ContainerImage,

pkg/mariadb/volumes.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,10 @@ func getGaleraVolumes(g *mariadbv1.Galera) []corev1.Volume {
101101
Key: "mysql_probe.sh",
102102
Path: "mysql_probe.sh",
103103
},
104+
{
105+
Key: "mysql_shutdown.sh",
106+
Path: "mysql_shutdown.sh",
107+
},
104108
{
105109
Key: "detect_last_commit.sh",
106110
Path: "detect_last_commit.sh",

templates/galera/bin/detect_last_commit.sh

Lines changed: 64 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -8,37 +8,74 @@ recover_args="--datadir=/var/lib/mysql \
88
--skip-networking \
99
--wsrep-cluster-address=gcomm://localhost"
1010
recovery_file_regex='s/.*WSREP\:.*position\s*recovery.*--log_error='\''\([^'\'']*\)'\''.*/\1/p'
11-
recovered_position_regex='s/.*WSREP\:\s*[R|r]ecovered\s*position.*\:\(.*\)\s*$/\1/p'
11+
recovered_position_uuid_regex='s/.*WSREP\:\s*[R|r]ecovered\s*position\:\ \(.*\)\:.*$/\1/p'
12+
recovered_position_seqno_regex='s/.*WSREP\:\s*[R|r]ecovered\s*position.*\:\(.*\)\s*$/\1/p'
13+
14+
grastate_file=/var/lib/mysql/grastate.dat
15+
gvwstate_file=/var/lib/mysql/gvwstate.dat
16+
17+
uuid=""
18+
seqno=""
19+
safe_to_bootstrap=0
20+
no_grastate=0
21+
22+
function json_summary {
23+
declare -a out
24+
if [ -n "$uuid" ]; then out+=( "\"uuid\":\"$uuid\"" ); fi
25+
if [ -n "$seqno" ]; then out+=( "\"seqno\":\"$seqno\"" ); fi
26+
if [ $safe_to_bootstrap -ne 0 ]; then out+=( '"safe_to_bootstrap":true' ); fi
27+
if [ $no_grastate -ne 0 ]; then out+=( '"no_grastate":true' ); fi
28+
IFS=, ; echo "{${out[*]}}"
29+
}
30+
31+
trap json_summary EXIT
1232

1333
# codership/galera#354
1434
# Some ungraceful shutdowns can leave an empty gvwstate.dat on
1535
# disk. This will prevent galera from joining the cluster if it is
1636
# configured to attempt PC recovery. Removing that file makes the
1737
# node fall back to the normal, unoptimized joining process.
18-
if [ -f /var/lib/mysql/gvwstate.dat ] && \
19-
[ ! -s /var/lib/mysql/gvwstate.dat ]; then
20-
echo "empty /var/lib/mysql/gvwstate.dat detected, removing it to prevent PC recovery failure at next restart" >&2
21-
rm -f /var/lib/mysql/gvwstate.dat
38+
if [ -f $gvwstate_file ] && \
39+
[ ! -s $gvwstate_file ]; then
40+
echo "empty $gvwstate_file detected, removing it to prevent PC recovery failure at next restart" >&2
41+
rm -f $gvwstate_file
42+
fi
43+
44+
# Attempt to retrieve the seqno information and safe_to_bootstrap hint
45+
# from the saved state file on disk
46+
47+
if [ -f $grastate_file ]; then
48+
uuid="$(cat $grastate_file | sed -n 's/^uuid.\s*\(.*\)\s*$/\1/p')"
49+
seqno="$(cat $grastate_file | sed -n 's/^seqno.\s*\(.*\)\s*$/\1/p')"
50+
safe_to_bootstrap="$(cat $grastate_file | sed -n 's/^safe_to_bootstrap.\s*\(.*\)\s*$/\1/p')"
51+
52+
if [ -z "$uuid" ] || \
53+
[ "$uuid" = "00000000-0000-0000-0000-000000000000" ]; then
54+
safe_to_bootstrap=0
55+
fi
56+
if [ "$safe_to_bootstrap" = "1" ]; then
57+
if [ -z "$seqno" ] || [ "$seqno" = "-1" ]; then
58+
safe_to_bootstrap=0
59+
fi
60+
fi
2261
fi
2362

24-
echo "attempting to detect last commit version by reading grastate.dat" >&2
25-
last_commit="$(cat /var/lib/mysql/grastate.dat | sed -n 's/^seqno.\s*\(.*\)\s*$/\1/p')"
26-
if [ -z "$last_commit" ] || [ "$last_commit" = "-1" ]; then
63+
# If the seqno could not be retrieved, inspect the mysql database
64+
65+
if [ -z "$seqno" ] || [ "$seqno" = "-1" ]; then
2766
tmp=$(mktemp)
2867
chown mysql:mysql $tmp
2968

30-
# if we pass here because grastate.dat doesn't exist,
31-
# try not to bootstrap from this node if possible
32-
# if [ ! -f /var/lib/mysql/grastate.dat ]; then
33-
# set_no_grastate
34-
# fi
35-
36-
echo "now attempting to detect last commit version using 'mysqld_safe --wsrep-recover'" >&2
69+
# if we pass here because grastate.dat doesn't exist, report it
70+
if [ ! -f /var/lib/mysql/grastate.dat ]; then
71+
no_grastate=1
72+
fi
3773

38-
mysqld_safe --wsrep-recover $recover_args --log-error=$tmp 1>&2
74+
mysqld_safe --wsrep-recover $recover_args --log-error=$tmp >/dev/null
3975

40-
last_commit="$(cat $tmp | sed -n $recovered_position_regex | tail -1)"
41-
if [ -z "$last_commit" ]; then
76+
seqno="$(cat $tmp | sed -n "$recovered_position_seqno_regex" | tail -1)"
77+
uuid="$(cat $tmp | sed -n "$recovered_position_uuid_regex" | tail -1)"
78+
if [ -z "$seqno" ]; then
4279
# Galera uses InnoDB's 2pc transactions internally. If
4380
# server was stopped in the middle of a replication, the
4481
# recovery may find a "prepared" XA transaction in the
@@ -52,25 +89,26 @@ if [ -z "$last_commit" ] || [ "$last_commit" = "-1" ]; then
5289
# since the DB will get resynchronized anyway
5390
echo "local node was not shutdown properly. Rollback stuck transaction with --tc-heuristic-recover" >&2
5491
mysqld_safe --wsrep-recover $recover_args \
55-
--tc-heuristic-recover=rollback --log-error=$tmp 2>/dev/null
92+
--tc-heuristic-recover=rollback --log-error=$tmp >/dev/null 2>&1
5693

57-
last_commit="$(cat $tmp | sed -n $recovered_position_regex | tail -1)"
58-
if [ ! -z "$last_commit" ]; then
94+
seqno="$(cat $tmp | sed -n "$recovered_position_seqno_regex" | tail -1)"
95+
uuid="$(cat $tmp | sed -n "$recovered_position_uuid_regex" | tail -1)"
96+
if [ ! -z "$seqno" ]; then
5997
echo "State recovered. force SST at next restart for full resynchronization" >&2
6098
rm -f /var/lib/mysql/grastate.dat
6199
# try not to bootstrap from this node if possible
62-
# set_no_grastate
100+
no_grastate=1
63101
fi
64102
fi
65103
fi
66104
fi
67105
rm -f $tmp
68106
fi
69107

70-
if [ ! -z "$last_commit" ]; then
71-
echo "$last_commit"
72-
exit 0
73-
else
108+
109+
if [ -z "$seqno" ]; then
74110
echo "Unable to detect last known write sequence number" >&2
75111
exit 1
76112
fi
113+
114+
# json data is printed on exit
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
#!/bin/bash
2+
3+
# NOTE(dciabrin) we might use downward API to populate those in the future
4+
PODNAME=$HOSTNAME
5+
SERVICE=${PODNAME/-galera-[0-9]*/}
6+
7+
# API server config
8+
APISERVER=https://kubernetes.default.svc
9+
SERVICEACCOUNT=/var/run/secrets/kubernetes.io/serviceaccount
10+
NAMESPACE=$(cat ${SERVICEACCOUNT}/namespace)
11+
TOKEN=$(cat ${SERVICEACCOUNT}/token)
12+
CACERT=${SERVICEACCOUNT}/ca.crt
13+
14+
function log() {
15+
echo "$(date +%F_%H_%M_%S) `basename $0` $*"
16+
}
17+
18+
# Log in mariadb's log file if configured, so the output of this script
19+
# is captured when logToDisk is enabled in the galera CR
20+
LOGFILE=$(my_print_defaults mysqld | grep log-error | cut -d= -f2)
21+
if [ -f "$LOGFILE" ]; then
22+
exec &> >(cat >> "$LOGFILE") 2>&1
23+
else
24+
exec &> >(cat >> /proc/1/fd/1) 2>&1
25+
fi
26+
27+
# On update, k8s performs a rolling restart, but on resource deletion,
28+
# all pods are deleted concurrently due to the fact that we require
29+
# PodManagementPolicy: appsv1.ParallelPodManagement for bootstrapping
30+
# the cluster. So try to stop the nodes sequentially so that
31+
# the last galera node stopped can set a "safe_to_bootstrap" flag.
32+
33+
if curl -s --cacert ${CACERT} --header "Content-Type:application/json" --header "Authorization: Bearer ${TOKEN}" -X GET ${APISERVER}/api/v1/namespaces/openstack/pods/${PODNAME} | grep -q '"code": *401'; then
34+
log "Galera resource is being deleted"
35+
nth=$(( ${PODNAME//*-/} + 1 ))
36+
while : ; do
37+
size=$(mysql -uroot -p"${DB_ROOT_PASSWORD}" -sNEe "show status like 'wsrep_cluster_size';" | tail -1)
38+
if [ ${size:-0} -gt $nth ]; then
39+
log "Waiting for cluster to scale down"
40+
sleep 2
41+
else
42+
break
43+
fi
44+
done
45+
fi
46+
47+
log "Shutting down local galera node"
48+
mysqladmin -uroot -p"${DB_ROOT_PASSWORD}" shutdown

0 commit comments

Comments
 (0)