Skip to content

Commit f63c86a

Browse files
dciabrin, openshift-cherrypick-robot
authored and committed
Improve tracking and error reporting of startup probe
Currently, startup probes are scheduled with defaults from k8s (scheduled every 10s, failure threshold of 3). As galera joiner nodes can take a long time to start, this generates unnecessary unhealthy events. Rework how the startup probe works by allowing a single, long probe which internally loops while probing the startup state. Throughout the startup process, keep track of the specific startup phase so that, in case the startup times out, the probe can log a precise error. Also rework how joiner nodes are tracked, to fail early in case galera cannot join a primary partition, to avoid the server being stuck indefinitely until the startup probe times out. A subsequent commit will provide the ability to override probe settings and timeouts. Jira: OSPRH-11392
1 parent f248f6c commit f63c86a

File tree

5 files changed

+205
-16
lines changed

5 files changed

+205
-16
lines changed

pkg/mariadb/const.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,4 +6,7 @@ const (
66

77
// ActivePodSelectorKey - Selector key used to configure A/P service behavior
88
ActivePodSelectorKey = "statefulset.kubernetes.io/pod-name"
9+
10+
// StartupProbeTimeout - Time allowed for the startup probe (in seconds)
11+
StartupProbeTimeout = 240
912
)

pkg/mariadb/statefulset.go

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
package mariadb
22

33
import (
4+
"strconv"
5+
46
common "github.com/openstack-k8s-operators/lib-common/modules/common"
57
"github.com/openstack-k8s-operators/lib-common/modules/common/affinity"
68
mariadbv1 "github.com/openstack-k8s-operators/mariadb-operator/api/v1beta1"
@@ -112,6 +114,7 @@ func getGaleraInitContainers(g *mariadbv1.Galera) []corev1.Container {
112114
}
113115

114116
func getGaleraContainers(g *mariadbv1.Galera, configHash string) []corev1.Container {
117+
timeout := strconv.Itoa(StartupProbeTimeout)
115118
containers := []corev1.Container{{
116119
Image: g.Spec.ContainerImage,
117120
Name: "galera",
@@ -144,11 +147,13 @@ func getGaleraContainers(g *mariadbv1.Galera, configHash string) []corev1.Contai
144147
StartupProbe: &corev1.Probe{
145148
ProbeHandler: corev1.ProbeHandler{
146149
Exec: &corev1.ExecAction{
147-
Command: []string{"/bin/bash", "/var/lib/operator-scripts/mysql_probe.sh", "startup"},
150+
Command: []string{"/bin/bash", "/var/lib/operator-scripts/mysql_probe.sh", "startup", timeout},
148151
},
149152
},
150-
PeriodSeconds: 10,
151-
FailureThreshold: 30,
153+
// extra seconds so that the script is not preempted by k8s
154+
TimeoutSeconds: StartupProbeTimeout + 10,
155+
// the current probe implementation assumes a single failure threshold
156+
FailureThreshold: 1,
152157
},
153158
LivenessProbe: &corev1.Probe{
154159
ProbeHandler: corev1.ProbeHandler{

templates/galera/bin/mysql_probe.sh

Lines changed: 192 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -6,31 +6,212 @@ read -s -u 3 3< /var/lib/secrets/dbpassword MYSQL_PWD || true
66
export MYSQL_PWD
77

88
PROBE_USER=root
9-
function mysql_status_check {
9+
10+
# Paths tested by the probe: the mysql socket, and the marker file
# checked to detect a State Snapshot Transfer (SST) in progress
MYSQL_SOCKET=/var/lib/mysql/mysql.sock
11+
SST_IN_PROGRESS=/var/lib/mysql/sst_in_progress
12+
13+
# Number of attempts made by the retry loops in the check functions
CHECK_RETRY=10
14+
# Seconds slept between two attempts of a retry loop
CHECK_WAIT=0.5
15+
# Seconds slept between two startup checks in the startup probe loop
STARTUP_WAIT=2
16+
17+
# Last state message recorded by log_state, echoed when the script exits
LAST_STATE=""
18+
# Record a state message in the global LAST_STATE.
# Nothing is printed here: the function only remembers the most recent
# message passed to it.
#   $1: state message to record
function log_state {
19+
local state="$1"
20+
# do not duplicate error logs in the probe, to minimize the
21+
# output in k8s events in case the probe fails
22+
if [ "${LAST_STATE}" != "${state}" ]; then
23+
LAST_STATE="${state}"
24+
fi
25+
}
26+
27+
# Print the content of LAST_STATE on stdout, if it is not empty.
function log_last_state {
28+
if [ -n "${LAST_STATE}" ]; then
29+
echo "${LAST_STATE}"
30+
fi
31+
}
32+
# run log_last_state whenever the script exits, so that only the most
# recently recorded state is output
trap log_last_state EXIT
33+
34+
# Query a mysql status variable with "show status like", retrying on error.
#   $1: name of the status variable to query
# On success, prints the last line of the mysql output on stdout and
# returns 0. If all CHECK_RETRY attempts fail, prints the output of the
# last failed attempt on stderr and returns 1.
function get_mysql_status {
35+
local status=$1
36+
local i
37+
local out
38+
for i in $(seq $CHECK_RETRY); do
39+
# stderr is captured as well, so that the mysql error can be reported below
out=$(mysql -u${PROBE_USER} -sNEe "show status like '${status}';" 2>&1)
40+
if [ $? -eq 0 ]; then
41+
echo "${out}" | tail -1
42+
return 0
43+
else
44+
sleep ${CHECK_WAIT}
45+
fi
46+
done
47+
# if we pass here, log the last error from mysql
48+
echo "${out}" >&2
49+
return 1
50+
}
51+
52+
# Compare a mysql status variable against an expected value.
#   $1: name of the status variable to query
#   $2: expected value (exact string comparison)
# Returns 0 if the value reported by get_mysql_status equals the expected
# value. Otherwise records the mismatch with log_state and returns non-zero.
function check_mysql_status {
1053
local status=$1
1154
local expect=$2
12-
set -x
13-
mysql -u${PROBE_USER} -sNEe "show status like '${status}';" | tail -1 | grep -w -e "${expect}"
55+
local val
56+
local rc
57+
58+
val=$(get_mysql_status "${status}")
59+
test "${val}" = "${expect}"
60+
# save the comparison result before running any other command
rc=$?
61+
if [ $rc -ne 0 ]; then
62+
log_state "${status} (${val}) differs from ${expect}"
63+
fi
64+
return $rc
1465
}
1566

16-
# Consider the pod has "started" once mysql is reachable
17-
# and is part of the primary partition
18-
if [ "$1" = "startup" ]; then
19-
mysql_status_check wsrep_cluster_status Primary
20-
exit $?
21-
fi
67+
# Check whether an SST marker file is present on disk.
# Returns 0 if the SST_IN_PROGRESS file exists while the mysql socket does
# not. Returns 1 as soon as the mysql socket exists, or if neither file
# showed up after CHECK_RETRY attempts.
function check_sst_in_progress {
68+
local i
69+
# retry to give some time to mysql to set up the SST
70+
for i in $(seq $CHECK_RETRY); do
71+
if [ -e ${MYSQL_SOCKET} ]; then
72+
return 1
73+
elif [ -e ${SST_IN_PROGRESS} ]; then
74+
return 0
75+
else
76+
sleep ${CHECK_WAIT}
77+
fi
78+
done
79+
return 1
80+
}
2281

82+
# Check whether mysql can be reached from the CLI.
# Returns 0 as soon as the mysql socket exists and "mysqladmin ping"
# succeeds. Returns 1 if that did not happen within CHECK_RETRY attempts,
# sleeping CHECK_WAIT seconds between two attempts.
function check_mysql_ready {
83+
local i
84+
# retry to give some time to mysql to create its socket
85+
for i in $(seq $CHECK_RETRY); do
86+
# discard the ping output (absolute /dev/null), only the exit status matters
if [ -e ${MYSQL_SOCKET} ] && mysqladmin -s -u${PROBE_USER} ping >/dev/null; then
87+
return 0
88+
else
89+
sleep ${CHECK_WAIT}
90+
fi
91+
done
92+
return 1
93+
}
94+
95+
# Monitor the startup sequence until the galera node is connected
96+
# to a primary component and synced
97+
# NOTE: as of mariadb 10.5, if mysql connects to a non-primary
98+
# partition, it never creates any socket and gets stuck indefinitely.
99+
# In that case, in order to not wait until the startup times out
100+
# (very long), we error out of the probe so that the pod can restart
101+
# and mysql reconnect to a primary partition if possible.
102+
# Run one pass of the startup checks.
# Returns 0 once startup is complete: a bootstrap node is reachable from
# the CLI, or a joiner node is in the Primary partition and Synced.
# Returns 1 while startup is still in progress; the current phase is
# recorded with log_state.
# Exits the whole script with status 1 when the situation is considered
# unrecoverable (unknown startup mode, or no primary partition reachable).
function check_mysql_startup {
103+
# mysql initialization sequence:
104+
# . mysql connects to a remote galera node over port 4567
105+
# . mysql optionally runs a SST (port 4444), SST marker created on disk
106+
# . only at this point, InnoDB is initialized, mysql pidfile and
107+
# mysql socket are created on disk
108+
109+
if pgrep -f detect_gcomm_and_start.sh >/dev/null ; then
110+
log_state "waiting for gcomm URI"
111+
return 1
112+
fi
113+
# pidfile is not written on disk until mysql is ready,
114+
# so look for the mysqld process instead
115+
if ! pgrep -f /usr/libexec/mysqld >/dev/null ; then
116+
log_state "waiting for mysql to start"
117+
return 1
118+
fi
119+
120+
# a bootstrap node must be reachable from the CLI to finish startup
121+
if pgrep -f -- '--wsrep-cluster-address=gcomm://(\W|$)' >/dev/null; then
122+
check_mysql_ready
123+
return $?
124+
# a joiner node must have an established socket connection before testing further
125+
elif pgrep -f -- '--wsrep-cluster-address=gcomm://\w' >/dev/null; then
126+
local connections
127+
connections=$(ss -tnH state established src :4567 or dst :4567 | wc -l)
128+
# at least one established connection on the galera port is required
if ! test "${connections}" -ge 1; then
129+
log_state "waiting for mysql to join a galera cluster"
130+
return 1
131+
fi
132+
else
133+
log_state "could not determine galera startup mode"
134+
exit 1
135+
fi
136+
137+
# a joiner node requires additional startup checks
138+
if [ -e ${MYSQL_SOCKET} ]; then
139+
# good case, mysql is ready to be probed from the CLI
140+
# check WSREP status like the regular liveness probe
141+
local status
142+
local comment
143+
status=$(get_mysql_status wsrep_cluster_status)
144+
comment=$(get_mysql_status wsrep_local_state_comment)
145+
if [ "${status}" = "Primary" -a "${comment}" = "Synced" ]; then
146+
return 0
147+
elif [ "${status}" = "Primary" ]; then
148+
log_state "waiting to be synced with the cluster"
149+
return 1
150+
elif [ "${status}" = "Non-primary" -a "${comment}" = "Synced" ]; then
151+
log_state "mysql is connected to a non-primary partition, server stopped"
152+
exit 1
153+
else
154+
log_state "waiting for connection to a primary partition"
155+
return 1
156+
fi
157+
else
158+
# if there is no socket, mysql may be running an SST...
159+
if check_sst_in_progress; then
160+
log_state "waiting for SST to finish"
161+
return 1
162+
fi
163+
164+
# ... if no SST was detected, it may have finished before
165+
# we probed it. Check a last time whether we can connect to mysql
166+
if check_mysql_ready; then
167+
return 0
168+
fi
169+
170+
# At this stage, mysql is either trying to connect to a bootstrap node
171+
# that resolved to an old pod IP, or it is connected to a
172+
# non-primary partition. Either way, this is not recoverable, so
173+
# make the probe fail and let k8s kill the mysql server.
174+
175+
log_state "could not find a primary partition to connect to"
176+
exit 1
177+
fi
178+
return 1
179+
}
180+
181+
182+
# startup probe loops until the node started or joined a galera cluster
23183
# readiness and liveness probes are run by k8s only after start probe succeeded
24184

25185
case "$1" in
186+
startup)
187+
if [ -z "$2" ]; then
188+
echo "startup timeout option missing"
189+
exit 1
190+
fi
191+
TIME_TIMEOUT=$2
192+
193+
# Run the entire check in a single startup probe to avoid spurious
194+
# "Unhealthy" k8s events to be logged. The probe stops in error
195+
# if the startup timeout is reached
196+
rc=1
197+
while [ $rc -ne 0 ]; do
198+
if check_mysql_startup; then
199+
exit 0
200+
else
201+
sleep ${STARTUP_WAIT};
202+
[ $SECONDS -ge $TIME_TIMEOUT ] && exit 1
203+
fi
204+
done
205+
exit $rc
206+
;;
26207
readiness)
27208
# If the node is e.g. a donor, it cannot serve traffic
28-
mysql_status_check wsrep_local_state_comment Synced
209+
check_mysql_status wsrep_local_state_comment Synced
29210
;;
30211
liveness)
31212
# If the node is not in the primary partition, the failed liveness probe
32213
# will make k8s restart this pod
33-
mysql_status_check wsrep_cluster_status Primary
214+
check_mysql_status wsrep_cluster_status Primary
34215
;;
35216
*)
36217
echo "Invalid probe option '$1'"

templates/galera/config/galera.cnf.in

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ wsrep_debug = 0
4747
wsrep_drupal_282555_workaround = 0
4848
wsrep_on = ON
4949
wsrep_provider = /usr/lib64/galera/libgalera_smm.so
50-
wsrep_provider_options = gmcast.listen_addr=tcp://{ PODIP }:4567
50+
wsrep_provider_options = pc.wait_prim=FALSE;gcache.recover=no;gmcast.listen_addr=tcp://{ PODIP }:4567
5151
wsrep_retry_autocommit = 1
5252
wsrep_slave_threads = 1
5353
wsrep_sst_method = rsync

templates/galera/config/galera_tls.cnf.in

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ ssl-cert = /etc/pki/tls/certs/galera.crt
44
ssl-key = /etc/pki/tls/private/galera.key
55
ssl-ca = /etc/pki/ca-trust/extracted/pem/tls-ca-bundle.pem
66
ssl-cipher = !SSLv2:kEECDH:kRSA:kEDH:kPSK:+3DES:!aNULL:!eNULL:!MD5:!EXP:!RC4:!SEED:!IDEA:!DES:!SSLv3:!TLSv1
7-
wsrep_provider_options = gcache.recover=no;gmcast.listen_addr=tcp://{ PODIP }:4567;socket.ssl_key=/etc/pki/tls/private/galera.key;socket.ssl_cert=/etc/pki/tls/certs/galera.crt;socket.ssl_cipher={ SSL_CIPHER };socket.ssl_ca=/etc/pki/ca-trust/extracted/pem/tls-ca-bundle.pem;
7+
wsrep_provider_options = pc.wait_prim=FALSE;gcache.recover=no;gmcast.listen_addr=tcp://{ PODIP }:4567;socket.ssl_key=/etc/pki/tls/private/galera.key;socket.ssl_cert=/etc/pki/tls/certs/galera.crt;socket.ssl_cipher={ SSL_CIPHER };socket.ssl_ca=/etc/pki/ca-trust/extracted/pem/tls-ca-bundle.pem;
88

99
[sst]
1010
sockopt = cipher=!SSLv2:kEECDH:kRSA:kEDH:kPSK:+3DES:!aNULL:!eNULL:!MD5:!EXP:!RC4:!SEED:!IDEA:!DES:!SSLv3:!TLSv1

0 commit comments

Comments
 (0)