Rework retry/timeout defaults to ensure fast service failover

dciabrin · dciabrin · commit 288f775c5993 · 2025-06-20T19:37:29.000+02:00
When the galera pod that receives database traffic becomes
unresponsive, the galera library reacts by running a script
in one of the surviving pods to elect a new endpoint. This
script uses curl to call the API server to update the selector
object responsible for balancing database traffic.

If during the API call the API server becomes unresponsive/unreachable
(e.g. the API VIP fails over to another master node), the curl call
might get stuck for an unbounded period of time, which delays the
traffic failover and can cause a long database service disruption.

Add a default connect timeout and update default retry parameters
so that curl is never blocked for too long, and the endpoint
configuration can be retried until the API server becomes available.

This commit only improves the default parameters; the ability to override
those parameters will be addressed in a subsequent commit.

Jira: OSPRH-17604
diff --git a/templates/galera/bin/mysql_wsrep_notify.sh b/templates/galera/bin/mysql_wsrep_notify.sh
@@ -11,9 +11,14 @@ NAMESPACE=$(cat ${SERVICEACCOUNT}/namespace)
 TOKEN=$(cat ${SERVICEACCOUNT}/token)
 CACERT=${SERVICEACCOUNT}/ca.crt
 
-# Retry config
-RETRIES=6
-WAIT=1
+# OSPRH-17604: use default timeout and retry parameters for fast failover
+# default parameters for curl calls to the API server
+: ${WSREP_NOTIFY_CURL_CONNECT_TIMEOUT:=5}
+: ${WSREP_NOTIFY_CURL_MAX_TIME:=30}
+CURL="curl --connect-timeout ${WSREP_NOTIFY_CURL_CONNECT_TIMEOUT} --max-time ${WSREP_NOTIFY_CURL_MAX_TIME}"
+# default parameters for retry on error
+: ${WSREP_NOTIFY_RETRIES:=30}
+: ${WSREP_NOTIFY_RETRY_WAIT:=1}
 
 
 ##
@@ -66,7 +71,7 @@ function api_server {
         request="$request -d @-"
     fi
     local output
-    output=$(curl -s --cacert ${CACERT} --header "Content-Type:application/json" --header "Authorization: Bearer ${TOKEN}" --request $request ${APISERVER}/api/v1/namespaces/${NAMESPACE}/services/${service})
+    output=$(${CURL} -s --cacert ${CACERT} --header "Content-Type:application/json" --header "Authorization: Bearer ${TOKEN}" --request $request ${APISERVER}/api/v1/namespaces/${NAMESPACE}/services/${service})
 
     local rc=$?
     if [ $rc != 0 ]; then
@@ -109,8 +114,8 @@ function parse_output {
 # Generic retry logic for an action function
 function retry {
     local action=$1
-    local retries=$RETRIES
-    local wait=$WAIT
+    local retries=$WSREP_NOTIFY_RETRIES
+    local wait=$WSREP_NOTIFY_RETRY_WAIT
     local rc=1
 
     $action
@@ -132,7 +137,7 @@ function retry {
         mysql_probe_state reprobe
     done
     if [ $rc -ne 0 ]; then
-        log_error "Could not run action after ${RETRIES} tries. Stop retrying."
+        log_error "Could not run action after ${WSREP_NOTIFY_RETRIES} tries. Stop retrying."
     fi
     return $rc
 }