
Commit 71e09fe

dciabrin authored and openshift-cherrypick-robot committed
Improve the failover of galera service
When a galera node is in the process of shutting down (e.g. during a rolling restart caused by a minor update), the node is unable to serve SQL queries, yet it is still connected to clients. This confuses clients, which receive an unexpected SQL status [1] that prevents them from retrying their queries and causes unexpected errors down the road.

Improve the pod stop pre-hook to fail over the active endpoint to another pod before shutting down the galera server, and kill connected clients to force them to reconnect to the new active endpoint. At that point the galera server can be safely shut down, as no client will see its WSREP state change.

Also update the failover script:
1) when no endpoint is available, ensure no traffic goes through any pod.
2) do not trigger an endpoint failover as long as the current endpoint targets a galera node that is still part of the primary partition (i.e. it is still able to serve traffic).

[1] 'WSREP has not yet prepared node for application use'

Jira: OSPRH-11488
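For context on the mechanism: the "active endpoint" is a regular Kubernetes Service whose selector is pinned to a single pod via the statefulset.kubernetes.io/pod-name label, and failing over amounts to rewriting that selector. A rough hand-run equivalent of what the hook performs through the k8s REST API (service and pod names below are illustrative, not taken from this repo):

    # hypothetical names; the script does the same thing via curl against the API server
    kubectl patch service openstack-galera \
      -p '{"spec":{"selector":{"statefulset.kubernetes.io/pod-name":"openstack-galera-1"}}}'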
1 parent 352c579 commit 71e09fe

File tree: 2 files changed (+102, -23 lines)


templates/galera/bin/mysql_shutdown.sh (12 additions, 1 deletion)
@@ -44,5 +44,16 @@ if curl -s --cacert ${CACERT} --header "Content-Type:application/json" --header
     done
 fi
 
-log "Shutting down local galera node"
+log "Initiating orchestrated shutdown of the local galera node"
+
+log "Failover service to another available galera node"
+bash $(dirname $0)/mysql_wsrep_notify.sh --status failover
+
+log "Close all active connections to this local galera node"
+# filter out system and localhost connections, only consider clients with a port in the host field
+# from that point, clients will automatically reconnect to another node
+CLIENTS=$(mysql -uroot -p${DB_ROOT_PASSWORD} -nN -e "select id from information_schema.processlist where host like '%:%';")
+echo -n "$CLIENTS" | tr '\n' ',' | xargs mysqladmin -uroot -p${DB_ROOT_PASSWORD} kill
+
+log "Shutdown local server"
 mysqladmin -uroot -p"${DB_ROOT_PASSWORD}" shutdown
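As a quick illustration of the new kill step (connection ids below are made up): client sessions show a host:port value in the processlist host column, which is what the '%:%' filter matches, and the pipeline turns the newline-separated id list into the comma-separated argument that mysqladmin kill accepts:

    # echo is used here only to print the command that would be run
    printf '42\n57' | tr '\n' ',' | xargs echo mysqladmin -uroot -pSECRET kill
    # -> mysqladmin -uroot -pSECRET kill 42,57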

templates/galera/bin/mysql_wsrep_notify.sh (90 additions, 22 deletions)
@@ -32,18 +32,26 @@ function log_error() {
 function mysql_get_status {
     local name=$1
     mysql -nNE -uroot -p"${DB_ROOT_PASSWORD}" -e "show status like '${name}';" | tail -1
-    if [ $? != 0 ]; then
-        log_error "could not get value of mysql variable '${name}' (rc=$?)"
-        return 1
-    fi
+    local rc=$?
+    [ $rc = 0 ] || log_error "could not get value of mysql variable '${name}' (rc=$rc)"
+}
+
+function mysql_get_members {
+    mysql -nN -uroot -p"${DB_ROOT_PASSWORD}" -e "select node_name from mysql.wsrep_cluster_members;"
+    local rc=$?
+    [ $rc = 0 ] || log_error "could not get cluster members from mysql' (rc=$rc)"
 }
 
-# Refresh environment variables with the latest WSREP state from mysql
+# When optional script parameters are not provided, set up the environment
+# variables with the latest WSREP state retrieved from mysql
 function mysql_probe_state {
-    UUID=$(mysql_get_status wsrep_gcomm_uuid)
-    PARTITION=$(mysql_get_status wsrep_cluster_status)
-    INDEX=$(mysql_get_status wsrep_local_index)
-    SIZE=$(mysql_get_status wsrep_cluster_size)
+    [ "$1" = "reprobe" ] && unset UUID PARTITION INDEX SIZE MEMBERS
+    : ${UUID=$(mysql_get_status wsrep_gcomm_uuid)}
+    : ${PARTITION=$(mysql_get_status wsrep_cluster_status)}
+    : ${INDEX=$(mysql_get_status wsrep_local_index)}
+    : ${SIZE=$(mysql_get_status wsrep_cluster_size)}
+    : ${MEMBERS=$(mysql_get_members)}
+    [ -n "${UUID}" -a -n "${PARTITION}" -a -n "${INDEX}" -a -n "${SIZE}" -a -n "${MEMBERS}" ]
 }
 
 # REST API call to the k8s API server
@@ -83,12 +91,10 @@ function api_server {
 # Update the service's active endpoint
 # (parse JSON with python3 as we don't have jq in the container image)
 function service_endpoint {
-    local endpoint=$1
-    if [ -n "${endpoint}" ]; then
-        python3 -c 'import json,sys;s=json.load(sys.stdin);s["spec"]["selector"]["statefulset.kubernetes.io/pod-name"]="'${endpoint}'";print(json.dumps(s,indent=2))'
-    else
-        python3 -c 'import json,sys;s=json.load(sys.stdin);s["spec"]["selector"].pop("statefulset.kubernetes.io/pod-name", None);print(json.dumps(s,indent=2))'
-    fi
+    local endpoint="$1"
+    # note: empty endpoint means "block incoming traffic", so the selector must still
+    # be present, otherwise k8s would balance incoming traffic to _any_ available pod.
+    python3 -c 'import json,sys;s=json.load(sys.stdin);s["spec"]["selector"]["statefulset.kubernetes.io/pod-name"]="'${endpoint}'";print(json.dumps(s,indent=2))'
     [ $? == 0 ] || log_error "Could not parse json endpoint (rc=$?)"
 }
 
@@ -123,7 +129,7 @@ function retry {
         retries=$((retries - 1))
         # reprobe mysql state now, as if the cluster state changed since
         # the start of this script, we might not need to retry the action
-        mysql_probe_state
+        mysql_probe_state reprobe
     done
     if [ $rc -ne 0 ]; then
         log_error "Could not run action after ${RETRIES} tries. Stop retrying."
@@ -149,6 +155,11 @@ function reconfigure_service_endpoint {
 
     CURRENT_ENDPOINT=$(echo "$CURRENT_SVC" | parse_output '["spec"]["selector"].get("statefulset.kubernetes.io/pod-name","")')
     [ $? == 0 ] || return 1
+    # do not reconfigure endpoint if unecessary, to avoid client disconnections
+    if [ -n "${CURRENT_ENDPOINT}" ] && echo "$MEMBERS" | grep -q "^${CURRENT_ENDPOINT}\$"; then
+        log "Active endpoint ${CURRENT_ENDPOINT} is still part of the primary partition. Nothing to be done."
+        return 0
+    fi
     if [ "${CURRENT_ENDPOINT}" == "${PODNAME}" ]; then
         log "Node ${PODNAME} is currently the active endpoint for service ${SERVICE}. Nothing to be done."
         return 0
@@ -164,6 +175,39 @@ function reconfigure_service_endpoint {
     return 0
 }
 
+## Failover to another node if we are the current Active endpoint
+function failover_service_endpoint {
+    if [ $PARTITION != "Primary" ]; then
+        log "Node ${PODNAME} is not the Primary partion. Nothing to be done."
+        return 0
+    fi
+
+    CURRENT_SVC=$(api_server GET "$SERVICE")
+    local rc=$?
+    [ $rc == 0 ] || return $rc
+
+    CURRENT_ENDPOINT=$(echo "$CURRENT_SVC" | parse_output '["spec"]["selector"].get("statefulset.kubernetes.io/pod-name","")')
+    [ $? == 0 ] || return 1
+    if [ "${CURRENT_ENDPOINT}" != "${PODNAME}" ]; then
+        log "Node ${PODNAME} is not the active endpoint. Nothing to be done."
+        return 0
+    fi
+    # select the first available node in the primary partition to be the failover endpoint
+    NEW_ENDPOINT=$(echo "$MEMBERS" | grep -v "${PODNAME}" | head -1)
+    if [ -z "${NEW_ENDPOINT}" ]; then
+        log "No other available node to become the active endpoint."
+    fi
+
+    NEW_SVC=$(echo "$CURRENT_SVC" | service_endpoint "$NEW_ENDPOINT")
+    [ $? == 0 ] || return 1
+
+    log "Configuring a new active endpoint for service ${SERVICE}: '${CURRENT_ENDPOINT}' -> '${NEW_ENDPOINT}'"
+    UPDATE_RESULT=$(echo "$NEW_SVC" | api_server PUT "$SERVICE")
+    [ $? == 0 ] || return 1
+
+    return 0
+}
+
 ## Change the Active endpoint from the service
 function remove_service_endpoint {
     CURRENT_SVC=$(api_server GET "$SERVICE")
@@ -194,17 +238,29 @@ function remove_service_endpoint {
 log "called with args: $*"
 
 # Galera always calls script with --status argument
-# All other arguments (uuid,partition,index...) are optional,
-# so get those values by probing mysql directly
-STATUS=""
-PARTITION=""
-INDEX=""
+# All other optional arguments (uuid,partition,index...):
+#   UUID: cluster's current UUID
+#   MEMBERS: galera node connected to the cluster
+#   SIZE: number of nodes in the cluster
+#   INDEX: member index in the cluster
+#   PARTITION: cluster partition we're in (Primary, Non-primary)
 while [ $# -gt 0 ]; do
     case $1 in
         --status)
            STATUS=$2
           shift;;
-        --uuid|--members|--primary|--index)
+        --members)
+            MEMBERS=$(echo "$2" | tr ',' '\n' | cut -d/ -f2)
+            SIZE=$(echo "$MEMBERS" | wc -l)
+            shift;;
+        --primary)
+            [ "$2" = "yes" ] && PARTITION="Primary"
+            [ "$2" = "no" ] && PARTITION="Non-primary"
+            shift;;
+        --index)
+            INDEX=$2
+            shift;;
+        --uuid)
            shift;;
    esac
    shift
@@ -215,6 +271,15 @@ if [ -z "${STATUS}" ]; then
     exit 1
 fi
 
+# Contition: ask for a failover. This should be called when mysql is running
+if echo "${STATUS}" | grep -i -q -e 'failover'; then
+    mysql_probe_state
+    if [ $? != 0 ]; then
+        log_error "Could not probe missing mysql information. Aborting"
+    fi
+    retry "failover_service_endpoint"
+fi
+
 # Condition: disconnecting -> remove oneself from endpoint if Active
 if echo "${STATUS}" | grep -i -q -e 'disconnecting'; then
     retry "remove_service_endpoint"
@@ -228,6 +293,9 @@ fi
 
 # At this point mysql is started, query missing arguments
 mysql_probe_state
+if [ $? != 0 ]; then
+    log_error "Could not probe missing mysql information. Aborting"
+fi
 
 # Condition: first member of the primary partition -> set as Active endpoint
 if [ $PARTITION = "Primary" -a $SIZE -ge 0 -a "$INDEX" = "0" ]; then
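To illustrate the new --members parsing: the notification passes a comma-separated list of member entries, and the second '/'-separated field is the node name the script keeps (the values below are a made-up sketch, not real cluster data):

    MEMBERS_ARG='u1/galera-0/10.1.0.5:3306,u2/galera-1/10.1.0.6:3306'
    MEMBERS=$(echo "$MEMBERS_ARG" | tr ',' '\n' | cut -d/ -f2)   # galera-0 and galera-1, one per line
    SIZE=$(echo "$MEMBERS" | wc -l)                              # 2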
