4848 enable_health_check_endpoint: True
4949 EOT
5050 # Add generated certificates to spec file
# Each PEM file is appended to /tmp/mgmt.spec line by line, right after its
# "ssl_cert: |" / "ssl_key: |" key; sed prefixes every line with leading
# whitespace so the content sits under that key.
# NOTE(review): `read` without -r and the unquoted $LINE would alter lines
# containing backslashes or runs of whitespace; presumably harmless for PEM
# data, but confirm.
51- echo " ssl_cert: |" >> /tmp/mgmt.spec
51+ echo " ssl_cert: |" >> /tmp/mgmt.spec
5252 while read LINE; do echo $LINE | sed -e "s/^/ /"; done < /tmp/cert.pem >> /tmp/mgmt.spec
5353 echo " ssl_key: |" >> /tmp/mgmt.spec
5454 while read LINE; do echo $LINE | sed -e "s/^/ /"; done < /tmp/key.pem >> /tmp/mgmt.spec
@@ -60,18 +60,42 @@ tasks:
6060 host.a :
6161 - |
6262 set -ex
63+
# Poll a service endpoint until its JSON response satisfies a jq filter.
#
# Arguments:
#   $1 - service name, used only in log messages
#   $2 - URL to query (fetched with `curl -k -s` as admin:admin)
#   $3 - jq filter; the service counts as healthy when `jq -e` exits 0 for it
#   $4 - optional number of attempts (default: 30)
#   $5 - optional seconds to sleep between attempts (default: 10)
# Outputs: progress messages on stdout; on timeout also the last response
#          body and the jq result for it
# Returns: 0 as soon as the service is healthy, 1 if every attempt failed
wait_for_service() {
  local name="$1"
  local url="$2"
  local jq_filter="$3"
  local max_attempts="${4:-30}"
  local interval="${5:-10}"
  local response=""
  local attempt

  echo "Waiting for service $name to be healthy at $url..."
  for ((attempt = 1; attempt <= max_attempts; attempt++)); do
    # A refused connection or a hung backend simply means "not ready yet":
    # bound each request with --max-time and swallow curl's exit status so a
    # caller running under `set -e` keeps retrying instead of aborting.
    response=$(curl -k -s --max-time 10 -u admin:admin "$url") || true

    # Non-JSON bodies (e.g. a proxy error page) are expected while the service
    # starts up, so jq's parse errors are silenced here; they are shown once
    # in the timeout report below.
    if jq -e "$jq_filter" <<<"$response" >/dev/null 2>&1; then
      echo "Service $name is healthy."
      return 0
    fi

    echo "Attempt $attempt: service $name not ready yet"
    # No point sleeping after the final attempt.
    if ((attempt < max_attempts)); then
      sleep "$interval"
    fi
  done

  echo "Timeout waiting for $name at $url"
  echo "Last HTTP response:"
  printf '%s\n' "$response"
  echo "jq output:"
  jq "$jq_filter" <<<"$response" || echo "(jq parse error or no match)"
  return 1
}
89+
6390 # retrieve mgmt hostname and ip
# The hostname is taken from the mgmt-gateway daemon(s) reported by
# `ceph orch ps`; the address is then looked up for that hostname in
# `ceph orch host ls`.
# NOTE(review): `jq -e '.[]'` exits non-zero when no daemon is listed, but
# no pipefail is visible here, so the final `jq -r` would mask that status
# and leave MGMT_GTW_HOST empty — confirm whether that case should abort.
6491 MGMT_GTW_HOST=$(ceph orch ps --daemon-type mgmt-gateway -f json | jq -e '.[]' | jq -r '.hostname')
6592 MGMT_GTW_IP=$(ceph orch host ls -f json | jq -r --arg MGMT_GTW_HOST "$MGMT_GTW_HOST" '.[] | select(.hostname==$MGMT_GTW_HOST) | .addr')
93+
6694 # check mgmt-gateway health
# Queries /health on the default HTTPS port and on port 29443; -k skips TLS
# certificate verification.
# NOTE(review): without -f, curl exits 0 on HTTP error responses, so these
# two calls presumably only catch connection-level failures — confirm.
6795 curl -k -s https://${MGMT_GTW_IP}/health
6896 curl -k -s https://${MGMT_GTW_IP}:29443/health
69- # wait for background services to be reconfigured following mgmt-gateway installation
70- sleep 180
71- # check grafana endpoints are responsive and database health is okay
72- curl -k -s https://${MGMT_GTW_IP}/grafana/api/health | jq -e '.database == "ok"'
73- # check prometheus endpoints are responsive
74- curl -k -s -u admin:admin https://${MGMT_GTW_IP}/prometheus/api/v1/status/config | jq -e '.status == "success"'
75- # check alertmanager endpoints are responsive
76- curl -k -s -u admin:admin https://${MGMT_GTW_IP}/alertmanager/api/v2/status
7797
98+ # wait for monitoring services
# Each call polls the service's endpoint behind the mgmt-gateway until the
# given jq filter passes; `|| exit 1` ends the script with status 1 if the
# service never becomes healthy.
99+ wait_for_service "Grafana" "https://${MGMT_GTW_IP}/grafana/api/health" '.database == "ok"' || exit 1
100+ wait_for_service "Prometheus" "https://${MGMT_GTW_IP}/prometheus/api/v1/status/config" '.status == "success"' || exit 1
101+ wait_for_service "Alertmanager" "https://${MGMT_GTW_IP}/alertmanager/api/v2/status" '.cluster.status == "ready"' || exit 1
0 commit comments