Skip to content

Commit 200b937

Browse files
Merge pull request #272 from dciabrin/safe_to_bootstrap
Fast bootstrap using safe_to_bootstrap flag
2 parents 6f4b7a6 + 67d53bd commit 200b937

File tree

8 files changed

+201
-45
lines changed

8 files changed

+201
-45
lines changed

api/bases/mariadb.openstack.org_galeras.yaml

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -117,10 +117,21 @@ spec:
117117
gcomm:
118118
description: Gcomm URI used to connect to the galera cluster
119119
type: string
120+
no_grastate:
121+
description: This galera node has its state recovered from the
122+
DB
123+
type: boolean
124+
safe_to_bootstrap:
125+
description: This galera node can bootstrap a galera cluster
126+
type: boolean
120127
seqno:
121128
description: Last recorded replication sequence number in the
122129
DB
123130
type: string
131+
uuid:
132+
description: UUID of the partition that is seen by the galera
133+
node
134+
type: string
124135
required:
125136
- seqno
126137
type: object

api/v1beta1/galera_types.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,8 +84,14 @@ type GaleraSpecCore struct {
8484

8585
// GaleraAttributes holds startup information for a Galera host
8686
type GaleraAttributes struct {
87+
// UUID of the partition that is seen by the galera node
88+
UUID string `json:"uuid,omitempty"`
8789
// Last recorded replication sequence number in the DB
8890
Seqno string `json:"seqno"`
91+
// This galera node can bootstrap a galera cluster
92+
SafeToBootstrap bool `json:"safe_to_bootstrap,omitempty"`
93+
// This galera node has its state recovered from the DB
94+
NoGrastate bool `json:"no_grastate,omitempty"`
8995
// Gcomm URI used to connect to the galera cluster
9096
Gcomm string `json:"gcomm,omitempty"`
9197
// Identifier of the container at the time the gcomm URI was injected

config/crd/bases/mariadb.openstack.org_galeras.yaml

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -117,10 +117,21 @@ spec:
117117
gcomm:
118118
description: Gcomm URI used to connect to the galera cluster
119119
type: string
120+
no_grastate:
121+
description: This galera node has its state recovered from the
122+
DB
123+
type: boolean
124+
safe_to_bootstrap:
125+
description: This galera node can bootstrap a galera cluster
126+
type: boolean
120127
seqno:
121128
description: Last recorded replication sequence number in the
122129
DB
123130
type: string
131+
uuid:
132+
description: UUID of the partition that is seen by the galera
133+
node
134+
type: string
124135
required:
125136
- seqno
126137
type: object

controllers/galera_controller.go

Lines changed: 49 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ package controllers
1919
import (
2020
"bytes"
2121
"context"
22+
"encoding/json"
2223
"fmt"
2324
"sort"
2425
"strconv"
@@ -94,20 +95,31 @@ func GetLog(ctx context.Context, controller string) logr.Logger {
9495
//
9596

9697
// findBestCandidate returns the first node flagged safe to bootstrap, or else the node with the highest seqno
97-
func findBestCandidate(status *mariadbv1.GaleraStatus) string {
98-
sortednodes := maps.Keys(status.Attributes)
98+
func findBestCandidate(g *mariadbv1.Galera) (node string, found bool) {
99+
sortednodes := maps.Keys(g.Status.Attributes)
99100
sort.Strings(sortednodes)
100101
bestnode := ""
101102
bestseqno := -1
102103
for _, node := range sortednodes {
103-
seqno := status.Attributes[node].Seqno
104+
// On clean shutdown, galera sets the last
105+
// stopped node as 'safe to bootstrap', so use
106+
// this hint when we can
107+
if g.Status.Attributes[node].SafeToBootstrap {
108+
return node, true
109+
}
110+
seqno := g.Status.Attributes[node].Seqno
104111
intseqno, _ := strconv.Atoi(seqno)
105112
if intseqno >= bestseqno {
106113
bestnode = node
107114
bestseqno = intseqno
108115
}
109116
}
110-
return bestnode //"galera-0"
117+
// if we pass here, a candidate is only valid if we
118+
// inspected all the expected replicas (e.g. typically 3)
119+
if len(g.Status.Attributes) != int(*g.Spec.Replicas) {
120+
return "", false
121+
}
122+
return bestnode, true //"galera-0"
111123
}
112124

113125
// buildGcommURI builds a gcomm URI for a galera instance
@@ -240,18 +252,22 @@ func injectGcommURI(ctx context.Context, h *helper.Helper, config *rest.Config,
240252
}
241253

242254
// retrieveSequenceNumber probes a pod's galera instance for sequence number
243-
func retrieveSequenceNumber(ctx context.Context, helper *helper.Helper, config *rest.Config, instance *mariadbv1.Galera, pod *corev1.Pod) error {
244-
err := mariadb.ExecInPod(ctx, helper, config, instance.Namespace, pod.Name, "galera",
255+
func retrieveSequenceNumber(ctx context.Context, helper *helper.Helper, config *rest.Config, instance *mariadbv1.Galera, pod *corev1.Pod) (errStr []string, err error) {
256+
errStr = nil
257+
err = mariadb.ExecInPod(ctx, helper, config, instance.Namespace, pod.Name, "galera",
245258
[]string{"/bin/bash", "/var/lib/operator-scripts/detect_last_commit.sh"},
246-
func(stdout *bytes.Buffer, _ *bytes.Buffer) error {
247-
seqno := strings.TrimSuffix(stdout.String(), "\n")
248-
attr := mariadbv1.GaleraAttributes{
249-
Seqno: seqno,
259+
func(stdout *bytes.Buffer, stderr *bytes.Buffer) error {
260+
var attr mariadbv1.GaleraAttributes
261+
if err := json.Unmarshal(stdout.Bytes(), &attr); err != nil {
262+
return err
263+
}
264+
if stderr.Len() > 0 {
265+
errStr = strings.Split(strings.TrimSuffix(stderr.String(), "\n"), "\n")
250266
}
251267
instance.Status.Attributes[pod.Name] = attr
252268
return nil
253269
})
254-
return err
270+
return
255271
}
256272

257273
// clearPodAttributes clears information known by the operator about a pod
@@ -737,7 +753,7 @@ func (r *GaleraReconciler) Reconcile(ctx context.Context, req ctrl.Request) (res
737753
for _, pod := range getReadyPods(podList.Items) {
738754
name := pod.Name
739755
if _, found := instance.Status.Attributes[name]; found {
740-
log.Info("Galera started on", "pod", pod.Name)
756+
log.Info("Galera started", "pod", name)
741757
clearPodAttributes(instance, name)
742758
}
743759
}
@@ -777,21 +793,36 @@ func (r *GaleraReconciler) Reconcile(ctx context.Context, req ctrl.Request) (res
777793
// . any other status means the pod is starting/restarting. We can't
778794
// exec into the pod yet, so we will probe it in another reconcile loop.
779795
if !instance.Status.Bootstrapped && !isBootstrapInProgress(instance) {
796+
var node string
797+
found := false
780798
for _, pod := range getRunningPodsMissingAttributes(ctx, podList.Items, instance, helper, r.config) {
781799
name := pod.Name
782800
util.LogForObject(helper, fmt.Sprintf("Pod %s running, retrieve seqno", name), instance)
783-
err := retrieveSequenceNumber(ctx, helper, r.config, instance, &pod)
801+
warn, err := retrieveSequenceNumber(ctx, helper, r.config, instance, &pod)
802+
if len(warn) > 0 {
803+
util.LogForObject(helper, fmt.Sprintf("Warning: %q", warn), instance)
804+
}
784805
if err != nil {
785-
log.Error(err, "Failed to retrieve seqno for ", "name", name)
806+
log.Error(err, fmt.Sprintf("Failed to retrieve seqno for %s", name))
786807
return ctrl.Result{}, err
787808
}
788-
log.Info("", "Pod", name, "seqno:", instance.Status.Attributes[name].Seqno)
809+
log.Info(fmt.Sprintf("Attributes retrieved for %s", name),
810+
"UUID", instance.Status.Attributes[name].UUID,
811+
"Seqno", instance.Status.Attributes[name].Seqno,
812+
"SafeToBootstrap", instance.Status.Attributes[name].SafeToBootstrap,
813+
)
814+
if instance.Status.Attributes[name].SafeToBootstrap {
815+
node = name
816+
found = true
817+
break
818+
}
789819
}
790820

791821
// Check if we have enough info to bootstrap the cluster now
792-
if (len(instance.Status.Attributes) > 0) &&
793-
(len(instance.Status.Attributes) == len(podList.Items)) {
794-
node := findBestCandidate(&instance.Status)
822+
if !found {
823+
node, found = findBestCandidate(instance)
824+
}
825+
if found {
795826
pod := getPodFromName(podList.Items, node)
796827
log.Info("Pushing gcomm URI to bootstrap", "pod", node)
797828
// Setting the gcomm attribute marks this pod as 'currently bootstrapping the cluster'

pkg/mariadb/statefulset.go

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,7 @@ func StatefulSet(g *mariadbv1.Galera, configHash string) *appsv1.StatefulSet {
7878
},
7979
corev1.LabelHostname,
8080
)
81-
if g.Spec.NodeSelector != nil && len(g.Spec.NodeSelector) > 0 {
81+
if len(g.Spec.NodeSelector) > 0 {
8282
sts.Spec.Template.Spec.NodeSelector = g.Spec.NodeSelector
8383
}
8484

@@ -164,6 +164,13 @@ func getGaleraContainers(g *mariadbv1.Galera, configHash string) []corev1.Contai
164164
},
165165
},
166166
},
167+
Lifecycle: &corev1.Lifecycle{
168+
PreStop: &corev1.LifecycleHandler{
169+
Exec: &corev1.ExecAction{
170+
Command: []string{"/bin/bash", "/var/lib/operator-scripts/mysql_shutdown.sh"},
171+
},
172+
},
173+
},
167174
}}
168175
logSideCar := corev1.Container{
169176
Image: g.Spec.ContainerImage,

pkg/mariadb/volumes.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,10 @@ func getGaleraVolumes(g *mariadbv1.Galera) []corev1.Volume {
101101
Key: "mysql_probe.sh",
102102
Path: "mysql_probe.sh",
103103
},
104+
{
105+
Key: "mysql_shutdown.sh",
106+
Path: "mysql_shutdown.sh",
107+
},
104108
{
105109
Key: "detect_last_commit.sh",
106110
Path: "detect_last_commit.sh",

templates/galera/bin/detect_last_commit.sh

Lines changed: 64 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -8,37 +8,74 @@ recover_args="--datadir=/var/lib/mysql \
88
--skip-networking \
99
--wsrep-cluster-address=gcomm://localhost"
1010
recovery_file_regex='s/.*WSREP\:.*position\s*recovery.*--log_error='\''\([^'\'']*\)'\''.*/\1/p'
11-
recovered_position_regex='s/.*WSREP\:\s*[R|r]ecovered\s*position.*\:\(.*\)\s*$/\1/p'
11+
recovered_position_uuid_regex='s/.*WSREP\:\s*[R|r]ecovered\s*position\:\ \(.*\)\:.*$/\1/p'
12+
recovered_position_seqno_regex='s/.*WSREP\:\s*[R|r]ecovered\s*position.*\:\(.*\)\s*$/\1/p'
13+
14+
grastate_file=/var/lib/mysql/grastate.dat
15+
gvwstate_file=/var/lib/mysql/gvwstate.dat
16+
17+
uuid=""
18+
seqno=""
19+
safe_to_bootstrap=0
20+
no_grastate=0
21+
22+
function json_summary {
23+
declare -a out
24+
if [ -n "$uuid" ]; then out+=( "\"uuid\":\"$uuid\"" ); fi
25+
if [ -n "$seqno" ]; then out+=( "\"seqno\":\"$seqno\"" ); fi
26+
if [ $safe_to_bootstrap -ne 0 ]; then out+=( '"safe_to_bootstrap":true' ); fi
27+
if [ $no_grastate -ne 0 ]; then out+=( '"no_grastate":true' ); fi
28+
IFS=, ; echo "{${out[*]}}"
29+
}
30+
31+
trap json_summary EXIT
1232

1333
# codership/galera#354
1434
# Some ungraceful shutdowns can leave an empty gvwstate.dat on
1535
# disk. This will prevent galera to join the cluster if it is
1636
# configured to attempt PC recovery. Removing that file makes the
1737
# node fall back to the normal, unoptimized joining process.
18-
if [ -f /var/lib/mysql/gvwstate.dat ] && \
19-
[ ! -s /var/lib/mysql/gvwstate.dat ]; then
20-
echo "empty /var/lib/mysql/gvwstate.dat detected, removing it to prevent PC recovery failure at next restart" >&2
21-
rm -f /var/lib/mysql/gvwstate.dat
38+
if [ -f $gvwstate_file ] && \
39+
[ ! -s $gvwstate_file ]; then
40+
echo "empty $gvwstate_file detected, removing it to prevent PC recovery failure at next restart" >&2
41+
rm -f $gvwstate_file
42+
fi
43+
44+
# Attempt to retrieve the seqno information and safe_to_bootstrap hint
45+
# from the saved state file on disk
46+
47+
if [ -f $grastate_file ]; then
48+
uuid="$(cat $grastate_file | sed -n 's/^uuid.\s*\(.*\)\s*$/\1/p')"
49+
seqno="$(cat $grastate_file | sed -n 's/^seqno.\s*\(.*\)\s*$/\1/p')"
50+
safe_to_bootstrap="$(cat $grastate_file | sed -n 's/^safe_to_bootstrap.\s*\(.*\)\s*$/\1/p')"
51+
52+
if [ -z "$uuid" ] || \
53+
[ "$uuid" = "00000000-0000-0000-0000-000000000000" ]; then
54+
safe_to_bootstrap=0
55+
fi
56+
if [ "$safe_to_bootstrap" = "1" ]; then
57+
if [ -z "$seqno" ] || [ "$seqno" = "-1" ]; then
58+
safe_to_bootstrap=0
59+
fi
60+
fi
2261
fi
2362

24-
echo "attempting to detect last commit version by reading grastate.dat" >&2
25-
last_commit="$(cat /var/lib/mysql/grastate.dat | sed -n 's/^seqno.\s*\(.*\)\s*$/\1/p')"
26-
if [ -z "$last_commit" ] || [ "$last_commit" = "-1" ]; then
63+
# If the seqno could not be retrieved, inspect the mysql database
64+
65+
if [ -z "$seqno" ] || [ "$seqno" = "-1" ]; then
2766
tmp=$(mktemp)
2867
chown mysql:mysql $tmp
2968

30-
# if we pass here because grastate.dat doesn't exist,
31-
# try not to bootstrap from this node if possible
32-
# if [ ! -f /var/lib/mysql/grastate.dat ]; then
33-
# set_no_grastate
34-
# fi
35-
36-
echo "now attempting to detect last commit version using 'mysqld_safe --wsrep-recover'" >&2
69+
# if we pass here because grastate.dat doesn't exist, report it
70+
if [ ! -f /var/lib/mysql/grastate.dat ]; then
71+
no_grastate=1
72+
fi
3773

38-
mysqld_safe --wsrep-recover $recover_args --log-error=$tmp 1>&2
74+
mysqld_safe --wsrep-recover $recover_args --log-error=$tmp >/dev/null
3975

40-
last_commit="$(cat $tmp | sed -n $recovered_position_regex | tail -1)"
41-
if [ -z "$last_commit" ]; then
76+
seqno="$(cat $tmp | sed -n "$recovered_position_seqno_regex" | tail -1)"
77+
uuid="$(cat $tmp | sed -n "$recovered_position_uuid_regex" | tail -1)"
78+
if [ -z "$seqno" ]; then
4279
# Galera uses InnoDB's 2pc transactions internally. If
4380
# server was stopped in the middle of a replication, the
4481
# recovery may find a "prepared" XA transaction in the
@@ -52,25 +89,26 @@ if [ -z "$last_commit" ] || [ "$last_commit" = "-1" ]; then
5289
# since the DB will get resynchronized anyway
5390
echo "local node was not shutdown properly. Rollback stuck transaction with --tc-heuristic-recover" >&2
5491
mysqld_safe --wsrep-recover $recover_args \
55-
--tc-heuristic-recover=rollback --log-error=$tmp 2>/dev/null
92+
--tc-heuristic-recover=rollback --log-error=$tmp >/dev/null 2>&1
5693

57-
last_commit="$(cat $tmp | sed -n $recovered_position_regex | tail -1)"
58-
if [ ! -z "$last_commit" ]; then
94+
seqno="$(cat $tmp | sed -n "$recovered_position_seqno_regex" | tail -1)"
95+
uuid="$(cat $tmp | sed -n "$recovered_position_uuid_regex" | tail -1)"
96+
if [ ! -z "$seqno" ]; then
5997
echo "State recovered. force SST at next restart for full resynchronization" >&2
6098
rm -f /var/lib/mysql/grastate.dat
6199
# try not to bootstrap from this node if possible
62-
# set_no_grastate
100+
no_grastate=1
63101
fi
64102
fi
65103
fi
66104
fi
67105
rm -f $tmp
68106
fi
69107

70-
if [ ! -z "$last_commit" ]; then
71-
echo "$last_commit"
72-
exit 0
73-
else
108+
109+
if [ -z "$seqno" ]; then
74110
echo "Unable to detect last known write sequence number" >&2
75111
exit 1
76112
fi
113+
114+
# json data is printed on exit
templates/galera/bin/mysql_shutdown.sh

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
#!/bin/bash
2+
3+
# NOTE(dciabrin) we might use downward API to populate those in the future
4+
PODNAME=$HOSTNAME
5+
SERVICE=${PODNAME/-galera-[0-9]*/}
6+
7+
# API server config
8+
APISERVER=https://kubernetes.default.svc
9+
SERVICEACCOUNT=/var/run/secrets/kubernetes.io/serviceaccount
10+
NAMESPACE=$(cat ${SERVICEACCOUNT}/namespace)
11+
TOKEN=$(cat ${SERVICEACCOUNT}/token)
12+
CACERT=${SERVICEACCOUNT}/ca.crt
13+
14+
function log() {
15+
echo "$(date +%F_%H_%M_%S) `basename $0` $*"
16+
}
17+
18+
# Log in mariadb's log file if configured, so the output of this script
19+
# is captured when logToDisk is enabled in the galera CR
20+
LOGFILE=$(my_print_defaults mysqld | grep log-error | cut -d= -f2)
21+
if [ -f "$LOGFILE" ]; then
22+
exec &> >(cat >> "$LOGFILE") 2>&1
23+
else
24+
exec &> >(cat >> /proc/1/fd/1) 2>&1
25+
fi
26+
27+
# On update, k8s performs a rolling restart, but on resource deletion,
28+
# all pods are deleted concurrently due to the fact that we require
29+
# PodManagementPolicy: appsv1.ParallelPodManagement for bootstrapping
30+
# the cluster. So try to stop the nodes sequentially so that
31+
# the last galera node stopped can set a "safe_to_bootstrap" flag.
32+
33+
if curl -s --cacert ${CACERT} --header "Content-Type:application/json" --header "Authorization: Bearer ${TOKEN}" -X GET ${APISERVER}/api/v1/namespaces/openstack/pods/${PODNAME} | grep -q '"code": *401'; then
34+
log "Galera resource is being deleted"
35+
nth=$(( ${PODNAME//*-/} + 1 ))
36+
while : ; do
37+
size=$(mysql -uroot -p"${DB_ROOT_PASSWORD}" -sNEe "show status like 'wsrep_cluster_size';" | tail -1)
38+
if [ ${size:-0} -gt $nth ]; then
39+
log "Waiting for cluster to scale down"
40+
sleep 2
41+
else
42+
break
43+
fi
44+
done
45+
fi
46+
47+
log "Shutting down local galera node"
48+
mysqladmin -uroot -p"${DB_ROOT_PASSWORD}" shutdown

0 commit comments

Comments
 (0)