Merge pull request #302 from openshift-cherrypick-robot/cherry-pick-298-to-18.0-fr1

openshift-merge-bot[bot] · web-flow · commit ef204688aeaf · 2025-01-31T16:27:01.000Z
[18.0-fr1] Improve tracking and error reporting of startup probe
diff --git a/pkg/mariadb/const.go b/pkg/mariadb/const.go
@@ -6,4 +6,7 @@ const (
 
 	// ActivePodSelectorKey - Selector key used to configure A/P service behavior
 	ActivePodSelectorKey = "statefulset.kubernetes.io/pod-name"
+
+	// StartupProbeTimeout - Time allowed for the startup probe to complete (in seconds)
+	StartupProbeTimeout = 240
 )
diff --git a/pkg/mariadb/statefulset.go b/pkg/mariadb/statefulset.go
@@ -1,6 +1,8 @@
 package mariadb
 
 import (
+	"strconv"
+
 	common "github.com/openstack-k8s-operators/lib-common/modules/common"
 	"github.com/openstack-k8s-operators/lib-common/modules/common/affinity"
 	mariadbv1 "github.com/openstack-k8s-operators/mariadb-operator/api/v1beta1"
@@ -112,6 +114,7 @@ func getGaleraInitContainers(g *mariadbv1.Galera) []corev1.Container {
 }
 
 func getGaleraContainers(g *mariadbv1.Galera, configHash string) []corev1.Container {
+	timeout := strconv.Itoa(StartupProbeTimeout)
 	containers := []corev1.Container{{
 		Image:   g.Spec.ContainerImage,
 		Name:    "galera",
@@ -144,11 +147,13 @@ func getGaleraContainers(g *mariadbv1.Galera, configHash string) []corev1.Contai
 		StartupProbe: &corev1.Probe{
 			ProbeHandler: corev1.ProbeHandler{
 				Exec: &corev1.ExecAction{
-					Command: []string{"/bin/bash", "/var/lib/operator-scripts/mysql_probe.sh", "startup"},
+					Command: []string{"/bin/bash", "/var/lib/operator-scripts/mysql_probe.sh", "startup", timeout},
 				},
 			},
-			PeriodSeconds:    10,
-			FailureThreshold: 30,
+			// extra seconds so that the script is not preempted by k8s
+			TimeoutSeconds: StartupProbeTimeout + 10,
+			// the current probe implementation assumes a single failure threshold
+			FailureThreshold: 1,
 		},
 		LivenessProbe: &corev1.Probe{
 			ProbeHandler: corev1.ProbeHandler{
diff --git a/templates/galera/bin/mysql_probe.sh b/templates/galera/bin/mysql_probe.sh
@@ -6,31 +6,212 @@ read -s -u 3 3< /var/lib/secrets/dbpassword MYSQL_PWD || true
 export MYSQL_PWD
 
 PROBE_USER=root
-function mysql_status_check {
+
+MYSQL_SOCKET=/var/lib/mysql/mysql.sock
+SST_IN_PROGRESS=/var/lib/mysql/sst_in_progress
+
+CHECK_RETRY=10
+CHECK_WAIT=0.5
+STARTUP_WAIT=2
+
+LAST_STATE=""
+function log_state {
+    # Remember the most recent probe state in LAST_STATE instead of
+    # printing it right away: only the last state is echoed (by the EXIT
+    # trap), which minimizes the output in k8s events if the probe fails.
+    local new_state="$1"
+    [ "${LAST_STATE}" = "${new_state}" ] && return 0
+    LAST_STATE="${new_state}"
+}
+
+function log_last_state {
+    # Print the last recorded probe state, if any. Installed below as an
+    # EXIT trap, so it runs whichever way the script terminates.
+    [ -z "${LAST_STATE}" ] || echo "${LAST_STATE}"
+}
+trap log_last_state EXIT
+
+function get_mysql_status {
+    # Query mysql for the status variable named $1 and print the last line
+    # of the answer on stdout. The query is attempted up to CHECK_RETRY
+    # times, CHECK_WAIT seconds apart; returns 1 if no attempt succeeded.
+    local name=$1
+    local attempt reply
+    for attempt in $(seq $CHECK_RETRY); do
+        if reply=$(mysql -u${PROBE_USER} -sNEe "show status like '${name}';" 2>&1); then
+            echo "${reply}" | tail -1
+            return 0
+        fi
+        sleep ${CHECK_WAIT}
+    done
+    # every attempt failed: report the last error from mysql on stderr
+    echo "${reply}" >&2
+    return 1
+}
+
+function check_mysql_status {
     local status=$1
     local expect=$2
-    set -x
-    mysql -u${PROBE_USER} -sNEe "show status like '${status}';" | tail -1 | grep -w -e "${expect}"
+    local val
+    local rc
+
+    val=$(get_mysql_status "${status}")
+    test "${val}" = "${expect}"
+    rc=$?
+    if [ $rc -ne 0 ]; then
+        log_state "${status} (${val}) differs from ${expect}"
+    fi
+    return $rc
 }
 
-# Consider the pod has "started" once mysql is reachable
-# and is part of the primary partition
-if [ "$1" = "startup" ]; then
-    mysql_status_check wsrep_cluster_status Primary
-    exit $?
-fi
+function check_sst_in_progress {
+    # Return 0 if the SST marker file is found while the mysql socket
+    # does not exist yet, 1 otherwise. Polls up to CHECK_RETRY times,
+    # CHECK_WAIT seconds apart, to give mysql some time to set up the SST.
+    local attempt
+    for attempt in $(seq $CHECK_RETRY); do
+        # the socket is only created once mysql is past any SST,
+        # so its presence means no SST is running anymore
+        [ -e ${MYSQL_SOCKET} ] && return 1
+        [ -e ${SST_IN_PROGRESS} ] && return 0
+        sleep ${CHECK_WAIT}
+    done
+    return 1
+}
 
+function check_mysql_ready {
+    # Return 0 once the mysql socket exists and the server answers a ping.
+    # Retry CHECK_RETRY times so that mysql has time to create its socket.
+    local i
+    for i in $(seq $CHECK_RETRY); do
+        if [ -e ${MYSQL_SOCKET} ] && mysqladmin -s -u${PROBE_USER} ping >/dev/null; then
+            return 0
+        fi
+        sleep ${CHECK_WAIT}
+    done
+    return 1
+}
+
+# Monitor the startup sequence until the galera node is connected
+# to a primary component and synced
+# NOTE: as of mariadb 10.5, if mysql connects to a non-primary
+# partition, it never creates any socket and gets stuck indefinitely.
+# In that case, in order to not wait until the startup times out
+# (very long), we error out of the probe so that the pod can restart
+# and mysql reconnect to a primary partition if possible.
+function check_mysql_startup {
+    # Return 0 when started, 1 to be polled again; exit 1 if unrecoverable.
+    # mysql initialization sequence:
+    #   . mysql connects to a remote galera node over port 4567
+    #   . mysql optionally runs a SST (port 4444), SST marker created on disk
+    #   . only at this point, InnoDB is initialized, mysql pidfile and
+    #     mysql socket are created on disk
+    if pgrep -f detect_gcomm_and_start.sh >/dev/null ; then
+        log_state "waiting for gcomm URI"
+        return 1
+    fi
+    # pidfile is not written on disk until mysql is ready,
+    # so look for the mysqld process instead
+    if ! pgrep -f /usr/libexec/mysqld >/dev/null ; then
+        log_state "waiting for mysql to start"
+        return 1
+    fi
+
+    # a bootstrap node must be reachable from the CLI to finish startup
+    if pgrep -f -- '--wsrep-cluster-address=gcomm://(\W|$)' >/dev/null; then
+        check_mysql_ready
+        return $?
+    # a joiner node must have an established socket connection before testing further
+    elif pgrep -f -- '--wsrep-cluster-address=gcomm://\w' >/dev/null; then
+        local connections
+        connections=$(ss -tnH state established src :4567 or dst :4567 | wc -l)
+        if ! test "${connections}" -gt 0; then
+            log_state "waiting for mysql to join a galera cluster"
+            return 1
+        fi
+    else
+        log_state "could not determine galera startup mode"
+        exit 1
+    fi
+
+    # a joiner node requires additional startup checks
+    if [ -e ${MYSQL_SOCKET} ]; then
+        # good case, mysql is ready to be probed from the CLI
+        # check WSREP status like the regular liveness probe
+        local status
+        local comment
+        status=$(get_mysql_status wsrep_cluster_status)
+        comment=$(get_mysql_status wsrep_local_state_comment)
+        if [ "${status}" = "Primary" ] && [ "${comment}" = "Synced" ]; then
+            return 0
+        elif [ "${status}" = "Primary" ]; then
+            log_state "waiting to be synced with the cluster"
+            return 1
+        elif [ "${status}" = "Non-primary" ] && [ "${comment}" = "Synced" ]; then
+            log_state "mysql is connected to a non-primary partition, server stopped"
+            exit 1
+        else
+            log_state "waiting for connection to a primary partition"
+            return 1
+        fi
+    else
+        # if there is no socket, mysql may be running an SST...
+        if check_sst_in_progress; then
+            log_state "waiting for SST to finish"
+            return 1
+        fi
+
+        # ... if no SST was detected, it may have finished before
+        # we probed it. Check a last time whether we can connect to mysql
+        if check_mysql_ready; then
+            return 0
+        fi
+
+        # At this stage, mysql is either trying to connect to a bootstrap node
+        # that resolved to an old pod IP, or it is connected to a
+        # non-primary partition. Either way, this is not recoverable, so
+        # make the probe fail and let k8s kill the mysql server.
+
+        log_state "could not find a primary partition to connect to"
+        exit 1
+    fi
+    return 1
+}
+
+
+# startup probe loops until the node started or joined a galera cluster
 # readiness and liveness probes are run by k8s only after start probe succeeded
 
 case "$1" in
+    startup)
+        # $2: time budget (in seconds) for the whole startup sequence
+        if [ -z "${2:-}" ]; then
+            echo "startup timeout option missing"
+            exit 1
+        fi
+        TIME_TIMEOUT=$2
+
+        # Run the entire check in a single startup probe to avoid spurious
+        # "Unhealthy" k8s events to be logged. The probe stops in error
+        # if the startup timeout is reached
+        # Poll every STARTUP_WAIT seconds until mysql has started.
+        while true; do
+            if check_mysql_startup; then
+                exit 0
+            fi
+            sleep ${STARTUP_WAIT}
+            # SECONDS is bash's count of seconds since the script started
+            [ "${SECONDS}" -ge "${TIME_TIMEOUT}" ] && exit 1
+        done
+        ;;
     readiness)
         # If the node is e.g. a donor, it cannot serve traffic
-        mysql_status_check wsrep_local_state_comment Synced
+        check_mysql_status wsrep_local_state_comment Synced
         ;;
     liveness)
         # If the node is not in the primary partition, the failed liveness probe
         # will make k8s restart this pod
-        mysql_status_check wsrep_cluster_status Primary
+        check_mysql_status wsrep_cluster_status Primary
         ;;
     *)
         echo "Invalid probe option '$1'"
diff --git a/templates/galera/config/galera.cnf.in b/templates/galera/config/galera.cnf.in
@@ -47,7 +47,7 @@ wsrep_debug = 0
 wsrep_drupal_282555_workaround = 0
 wsrep_on = ON
 wsrep_provider = /usr/lib64/galera/libgalera_smm.so
-wsrep_provider_options = gmcast.listen_addr=tcp://{ PODIP }:4567
+wsrep_provider_options = pc.wait_prim=FALSE;gcache.recover=no;gmcast.listen_addr=tcp://{ PODIP }:4567
 wsrep_retry_autocommit = 1
 wsrep_slave_threads = 1
 wsrep_sst_method = rsync
diff --git a/templates/galera/config/galera_tls.cnf.in b/templates/galera/config/galera_tls.cnf.in
@@ -4,7 +4,7 @@ ssl-cert = /etc/pki/tls/certs/galera.crt
 ssl-key = /etc/pki/tls/private/galera.key
 ssl-ca = /etc/pki/ca-trust/extracted/pem/tls-ca-bundle.pem
 ssl-cipher = !SSLv2:kEECDH:kRSA:kEDH:kPSK:+3DES:!aNULL:!eNULL:!MD5:!EXP:!RC4:!SEED:!IDEA:!DES:!SSLv3:!TLSv1
-wsrep_provider_options = gcache.recover=no;gmcast.listen_addr=tcp://{ PODIP }:4567;socket.ssl_key=/etc/pki/tls/private/galera.key;socket.ssl_cert=/etc/pki/tls/certs/galera.crt;socket.ssl_cipher={ SSL_CIPHER };socket.ssl_ca=/etc/pki/ca-trust/extracted/pem/tls-ca-bundle.pem;
+wsrep_provider_options = pc.wait_prim=FALSE;gcache.recover=no;gmcast.listen_addr=tcp://{ PODIP }:4567;socket.ssl_key=/etc/pki/tls/private/galera.key;socket.ssl_cert=/etc/pki/tls/certs/galera.crt;socket.ssl_cipher={ SSL_CIPHER };socket.ssl_ca=/etc/pki/ca-trust/extracted/pem/tls-ca-bundle.pem;
 
 [sst]
 sockopt = cipher=!SSLv2:kEECDH:kRSA:kEDH:kPSK:+3DES:!aNULL:!eNULL:!MD5:!EXP:!RC4:!SEED:!IDEA:!DES:!SSLv3:!TLSv1

Original file line number	Diff line number	Diff line change
`@@ -6,4 +6,7 @@ const (`
`6`	`6`
`7`	`7`	`// ActivePodSelectorKey - Selector key used to configure A/P service behavior`
`8`	`8`	`ActivePodSelectorKey = "statefulset.kubernetes.io/pod-name"`
	`9`	`+`
	`10`	`+ // StartupProbeTimeout - Time allowed for the startup probe to complete (in seconds)`
	`11`	`+ StartupProbeTimeout = 240`
`9`	`12`	`)`