@@ -32,18 +32,26 @@ function log_error() {
3232function mysql_get_status {
3333 local name=$1
3434 mysql -nNE -uroot -p" ${DB_ROOT_PASSWORD} " -e " show status like '${name} ';" | tail -1
35- if [ $? != 0 ]; then
36- log_error " could not get value of mysql variable '${name} ' (rc=$? )"
37- return 1
38- fi
35+ local rc=$?
36+ [ $rc = 0 ] || log_error " could not get value of mysql variable '${name} ' (rc=$rc )"
37+ }
38+
39+ function mysql_get_members {
40+ mysql -nN -uroot -p" ${DB_ROOT_PASSWORD} " -e " select node_name from mysql.wsrep_cluster_members;"
41+ local rc=$?
42+ [ $rc = 0 ] || log_error " could not get cluster members from mysql' (rc=$rc )"
3943}
4044
41- # Refresh environment variables with the latest WSREP state from mysql
45+ # When optional script parameters are not provided, set up the environment
46+ # variables with the latest WSREP state retrieved from mysql
4247function mysql_probe_state {
43- UUID=$( mysql_get_status wsrep_gcomm_uuid)
44- PARTITION=$( mysql_get_status wsrep_cluster_status)
45- INDEX=$( mysql_get_status wsrep_local_index)
46- SIZE=$( mysql_get_status wsrep_cluster_size)
48+ [ " $1 " = " reprobe" ] && unset UUID PARTITION INDEX SIZE MEMBERS
49+ : ${UUID=$(mysql_get_status wsrep_gcomm_uuid)}
50+ : ${PARTITION=$(mysql_get_status wsrep_cluster_status)}
51+ : ${INDEX=$(mysql_get_status wsrep_local_index)}
52+ : ${SIZE=$(mysql_get_status wsrep_cluster_size)}
53+ : ${MEMBERS=$(mysql_get_members)}
54+ [ -n " ${UUID} " -a -n " ${PARTITION} " -a -n " ${INDEX} " -a -n " ${SIZE} " -a -n " ${MEMBERS} " ]
4755}
4856
4957# REST API call to the k8s API server
@@ -83,12 +91,10 @@ function api_server {
8391# Update the service's active endpoint
8492# (parse JSON with python3 as we don't have jq in the container image)
8593function service_endpoint {
86- local endpoint=$1
87- if [ -n " ${endpoint} " ]; then
88- python3 -c ' import json,sys;s=json.load(sys.stdin);s["spec"]["selector"]["statefulset.kubernetes.io/pod-name"]="' ${endpoint} ' ";print(json.dumps(s,indent=2))'
89- else
90- python3 -c ' import json,sys;s=json.load(sys.stdin);s["spec"]["selector"].pop("statefulset.kubernetes.io/pod-name", None);print(json.dumps(s,indent=2))'
91- fi
94+ local endpoint=" $1 "
95+ # note: empty endpoint means "block incoming traffic", so the selector must still
96+ # be present, otherwise k8s would balance incoming traffic to _any_ available pod.
97+ python3 -c ' import json,sys;s=json.load(sys.stdin);s["spec"]["selector"]["statefulset.kubernetes.io/pod-name"]="' ${endpoint} ' ";print(json.dumps(s,indent=2))'
9298 [ $? == 0 ] || log_error " Could not parse json endpoint (rc=$? )"
9399}
94100
@@ -123,7 +129,7 @@ function retry {
123129 retries=$(( retries - 1 ))
124130 # reprobe mysql state now, as if the cluster state changed since
125131 # the start of this script, we might not need to retry the action
126- mysql_probe_state
132+ mysql_probe_state reprobe
127133 done
128134 if [ $rc -ne 0 ]; then
129135 log_error " Could not run action after ${RETRIES} tries. Stop retrying."
@@ -149,6 +155,11 @@ function reconfigure_service_endpoint {
149155
150156 CURRENT_ENDPOINT=$( echo " $CURRENT_SVC " | parse_output ' ["spec"]["selector"].get("statefulset.kubernetes.io/pod-name","")' )
151157 [ $? == 0 ] || return 1
158+ # do not reconfigure endpoint if unnecessary, to avoid client disconnections
159+ if [ -n " ${CURRENT_ENDPOINT} " ] && echo " $MEMBERS " | grep -q " ^${CURRENT_ENDPOINT} \$ " ; then
160+ log " Active endpoint ${CURRENT_ENDPOINT} is still part of the primary partition. Nothing to be done."
161+ return 0
162+ fi
152163 if [ " ${CURRENT_ENDPOINT} " == " ${PODNAME} " ]; then
153164 log " Node ${PODNAME} is currently the active endpoint for service ${SERVICE} . Nothing to be done."
154165 return 0
@@ -164,6 +175,39 @@ function reconfigure_service_endpoint {
164175 return 0
165176}
166177
178+ # # Failover to another node if we are the current Active endpoint
179+ function failover_service_endpoint {
180+ if [ $PARTITION != " Primary" ]; then
181+ log " Node ${PODNAME} is not the Primary partion. Nothing to be done."
182+ return 0
183+ fi
184+
185+ CURRENT_SVC=$( api_server GET " $SERVICE " )
186+ local rc=$?
187+ [ $rc == 0 ] || return $rc
188+
189+ CURRENT_ENDPOINT=$( echo " $CURRENT_SVC " | parse_output ' ["spec"]["selector"].get("statefulset.kubernetes.io/pod-name","")' )
190+ [ $? == 0 ] || return 1
191+ if [ " ${CURRENT_ENDPOINT} " != " ${PODNAME} " ]; then
192+ log " Node ${PODNAME} is not the active endpoint. Nothing to be done."
193+ return 0
194+ fi
195+ # select the first available node in the primary partition to be the failover endpoint
196+ NEW_ENDPOINT=$( echo " $MEMBERS " | grep -v " ${PODNAME} " | head -1)
197+ if [ -z " ${NEW_ENDPOINT} " ]; then
198+ log " No other available node to become the active endpoint."
199+ fi
200+
201+ NEW_SVC=$( echo " $CURRENT_SVC " | service_endpoint " $NEW_ENDPOINT " )
202+ [ $? == 0 ] || return 1
203+
204+ log " Configuring a new active endpoint for service ${SERVICE} : '${CURRENT_ENDPOINT} ' -> '${NEW_ENDPOINT} '"
205+ UPDATE_RESULT=$( echo " $NEW_SVC " | api_server PUT " $SERVICE " )
206+ [ $? == 0 ] || return 1
207+
208+ return 0
209+ }
210+
167211# # Change the Active endpoint from the service
168212function remove_service_endpoint {
169213 CURRENT_SVC=$( api_server GET " $SERVICE " )
@@ -194,17 +238,29 @@ function remove_service_endpoint {
194238log " called with args: $* "
195239
196240# Galera always calls script with --status argument
197- # All other arguments (uuid,partition,index...) are optional,
198- # so get those values by probing mysql directly
199- STATUS=" "
200- PARTITION=" "
201- INDEX=" "
241+ # All other arguments (uuid,partition,index...) are optional:
242+ # UUID: cluster's current UUID
243+ # MEMBERS: galera nodes connected to the cluster
244+ # SIZE: number of nodes in the cluster
245+ # INDEX: member index in the cluster
246+ # PARTITION: cluster partition we're in (Primary, Non-primary)
202247while [ $# -gt 0 ]; do
203248 case $1 in
204249 --status)
205250 STATUS=$2
206251 shift ;;
207- --uuid|--members|--primary|--index)
252+ --members)
253+ MEMBERS=$( echo " $2 " | tr ' ,' ' \n' | cut -d/ -f2)
254+ SIZE=$( echo " $MEMBERS " | wc -l)
255+ shift ;;
256+ --primary)
257+ [ " $2 " = " yes" ] && PARTITION=" Primary"
258+ [ " $2 " = " no" ] && PARTITION=" Non-primary"
259+ shift ;;
260+ --index)
261+ INDEX=$2
262+ shift ;;
263+ --uuid)
208264 shift ;;
209265 esac
210266 shift
@@ -215,6 +271,15 @@ if [ -z "${STATUS}" ]; then
215271 exit 1
216272fi
217273
274+ # Condition: ask for a failover. This should be called when mysql is running
275+ if echo " ${STATUS} " | grep -i -q -e ' failover' ; then
276+ mysql_probe_state
277+ if [ $? != 0 ]; then
278+ log_error " Could not probe missing mysql information. Aborting"
279+ fi
280+ retry " failover_service_endpoint"
281+ fi
282+
218283# Condition: disconnecting -> remove oneself from endpoint if Active
219284if echo " ${STATUS} " | grep -i -q -e ' disconnecting' ; then
220285 retry " remove_service_endpoint"
228293
229294# At this point mysql is started, query missing arguments
230295mysql_probe_state
296+ if [ $? != 0 ]; then
297+ log_error " Could not probe missing mysql information. Aborting"
298+ fi
231299
232300# Condition: first member of the primary partition -> set as Active endpoint
233301if [ $PARTITION = " Primary" -a $SIZE -ge 0 -a " $INDEX " = " 0" ]; then
0 commit comments