Skip to content

Commit b0aa722

Browse files
committed
Make the liveliness periodic more robust
The previous SSH test was prone to sporadic failures, with a failure rate above 10% for the amphora driver in vh-mecha-central. The HTTP test has been reliable so far in testing. The liveliness periodic invokes server.sh with `-u centos`, which previously both supplied the username to use when logging in via SSH and enabled the test. We no longer require a login username for the test, so we add a new option `-l` which enables the test without taking an argument. We retain the `-u` option for compatibility with the periodic; it can be removed after we have updated the periodic.
1 parent 5cb475d commit b0aa722

File tree

2 files changed

+72
-21
lines changed

2 files changed

+72
-21
lines changed

connectivity-test-cloud-init.yaml

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
#cloud-config
2+
3+
# Run a trivial http server which returns OK if it is able to connect to an
4+
# external web server.
5+
# N.B. This blocks, which means cloud-init will never complete. This is fine
6+
# for this CI test.
7+
runcmd:
8+
- - /usr/bin/python
9+
- -c
10+
- |
11+
from http.server import BaseHTTPRequestHandler
12+
from http.server import HTTPServer
13+
from urllib.request import urlopen
14+
15+
TEST_URL='https://www.google.com/'
16+
17+
class ConnectivityTest(BaseHTTPRequestHandler):
18+
def do_GET(self):
19+
try:
20+
urlopen(TEST_URL, timeout=5)
21+
self.send_response(200)
22+
self.end_headers()
23+
self.wfile.write('OK'.encode())
24+
except Exception as ex:
25+
self.send_response(500)
26+
self.end_headers()
27+
self.wfile.write(f'Unable to connect to {TEST_URL}: {ex}'.encode())
28+
29+
HTTPServer(('0.0.0.0', 80), ConnectivityTest).serve_forever()

server.sh

Lines changed: 43 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@
1616

1717
set -Eeuo pipefail
1818

19+
script_dir=$(dirname "$(realpath "${BASH_SOURCE[0]}")")
20+
1921
print_help() {
2022
echo -e 'github.com/shiftstack/shiftstack-ci'
2123
echo -e 'Spin a server on OpenStack'
@@ -34,25 +36,28 @@ print_help() {
3436
echo -e '\t-d\tRun the script in debug mode'
3537
echo -e '\t-p\tDo not clean up the server after creation'
3638
echo -e '\t\t(will print a cleanup script instead of executing it).'
37-
echo -e '\t-u\tTest connectivity from the instance by setting the cloud user from the image (e.g. centos)'
39+
echo -e '\t-u\tName of the cloud user from the image (e.g. centos) [not used, except to imply -l]'
40+
echo -e '\t-l\tInstall and run a connectivity test application'
3841
echo -e '\t-t\tRun the script without pause (create/cleanup)'
3942
}
4043

4144
declare \
4245
debug='' \
4346
persistent='' \
4447
interactive='' \
45-
os_user='' \
48+
liveness='' \
4649
server_flavor='' \
4750
server_image='' \
4851
key_name='' \
4952
external_network='external'
50-
while getopts dtpf:u:i:e:k:h opt; do
53+
# Note that the $OPTARG to -u is ignored because it is deprecated
54+
while getopts dtplf:u:i:e:k:h opt; do
5155
case "$opt" in
5256
d) debug='yes' ;;
5357
p) persistent='yes' ;;
5458
t) interactive='no' ;;
55-
u) os_user="$OPTARG" ;;
59+
u) liveness='yes' ;;
60+
l) liveness='yes' ;;
5661
f) server_flavor="$OPTARG" ;;
5762
i) server_image="$OPTARG" ;;
5863
e) external_network="$OPTARG" ;;
@@ -203,14 +208,19 @@ port_id="$(openstack port create -f value -c id \
203208
"$name")"
204209
>&2 echo "Created port ${port_id}"
205210

206-
server_id="$(openstack server create --wait -f value -c id \
207-
--block-device uuid="$vol_id" \
208-
--image "$server_image" \
209-
--flavor "$server_flavor" \
210-
--nic "port-id=$port_id" \
211-
--security-group "$sg_id" \
212-
--key-name "$key_name" \
213-
"$name")"
211+
declare -a server_create_args
212+
server_create_args=(
213+
--block-device uuid="$vol_id"
214+
--image "$server_image"
215+
--flavor "$server_flavor"
216+
--nic "port-id=$port_id"
217+
--security-group "$sg_id"
218+
--key-name "$key_name"
219+
)
220+
if [ "$liveness" == 'yes' ]; then
221+
server_create_args+=(--user-data "${script_dir}/connectivity-test-cloud-init.yaml")
222+
fi
223+
server_id=$(openstack server create --wait -f value -c id "${server_create_args[@]}" "$name")
214224
# shellcheck disable=SC2086
215225
server_id="$(echo $server_id | tr -d '\r')"
216226
>&2 echo "Created server ${server_id}"
@@ -226,13 +236,13 @@ for driver in "${!drivers[@]}"; do
226236
>&2 echo "Created loadbalancer ${lb_id}"
227237
lb_ids+=("${lb_id}")
228238

229-
lb_listener_id="$(openstack loadbalancer listener create --wait --name "$name" -f value -c id --protocol TCP --protocol-port 22 "$lb_id")"
239+
lb_listener_id="$(openstack loadbalancer listener create --wait --name "$name" -f value -c id --protocol TCP --protocol-port 80 "$lb_id")"
230240
>&2 echo "Created loadbalancer listener ${lb_listener_id}"
231241

232242
lb_pool_id="$(openstack loadbalancer pool create --wait --name "$name" -f value -c id --lb-algorithm "${drivers[$driver]}" --listener "$lb_listener_id" --protocol TCP)"
233243
>&2 echo "Created loadbalancer pool ${lb_pool_id}"
234244

235-
lb_member_id="$(openstack loadbalancer member create --wait -f value -c id --subnet-id "$subnet_id" --address "$server_ip" --protocol-port 22 "$lb_pool_id")"
245+
lb_member_id="$(openstack loadbalancer member create --wait -f value -c id --subnet-id "$subnet_id" --address "$server_ip" --protocol-port 80 "$lb_pool_id")"
236246
>&2 echo "Created loadbalancer member ${lb_member_id}"
237247

238248
fip_id="$(openstack floating ip create -f value -c id \
@@ -244,17 +254,29 @@ for driver in "${!drivers[@]}"; do
244254
lb_vip_id="$(openstack port show -f value -c id "${ports[$driver]}"-"$lb_id")"
245255
openstack floating ip set --port "$lb_vip_id" "$fip_id"
246256

247-
if [ "$os_user" != '' ]; then
248-
echo "Testing connectivity from the instance ${name}"
249-
sleep 60
250-
if ! ssh -o ConnectTimeout=30 -o StrictHostKeyChecking=no "$os_user"@"$fip_address" ping -c 1 1.1.1.1; then
251-
echo "Error when running a ping from the instance. Dumping load balancer status..."
257+
if [ "$liveness" == 'yes' ]; then
258+
echo "Testing connectivity to and from the instance ${name}"
259+
260+
# N.B. We use a retry loop rather than curl's retry
261+
# options here because it can catch more types of failure. e.g.
262+
# it can retry on 'No route to host' if the FIP hasn't
263+
# propagated to the network hardware yet, which curl cannot.
264+
start=$(date +%s)
265+
backoff=1
266+
while ! curl --fail-with-body http://"$fip_address"/; do
267+
# This normally succeeds immediately, but we allow it up to
268+
# 300 seconds.
269+
if [ $(( $(date +%s)-start )) -gt 300 ]; then
270+
echo "Error checking instance connectivity. Dumping load balancer status and console log."
252271
openstack loadbalancer status show "$lb_id"
253-
echo "Error when running a ping from the instance. Dumping instance console..."
254272
openstack console log show "$name" || true
255273
echo "Done"
256274
exit 1
257-
fi
275+
fi
276+
echo "Backing off for ${backoff} seconds"
277+
sleep ${backoff}
278+
backoff=$(( backoff * 2 ))
279+
done
258280
fi
259281
done
260282

0 commit comments

Comments
 (0)