mgr/dashboard: fix indefinite loop in cephadm dashboard e2e

nizamial09 · nizamial09 · commit 17cc32377154 · 2024-09-12T11:41:37.000+05:30
the tests seems waiting to fetch the prometheus details incase the cephadm ran into error and it just waits there for more than an hour without any progress. fixing that and some minor improvements. an example log: https://jenkins.ceph.com/job/ceph-dashboard-cephadm-e2e/12287/consoleFull Signed-off-by: Nizamudeen A <nia@redhat.com>
diff --git a/src/pybind/mgr/dashboard/ci/cephadm/bootstrap-cluster.sh b/src/pybind/mgr/dashboard/ci/cephadm/bootstrap-cluster.sh
@@ -32,7 +32,7 @@ cephadm_shell="$CEPHADM shell --fsid ${fsid} -c /etc/ceph/ceph.conf -k /etc/ceph
 {% for number in range(1, nodes) %}
   ssh-copy-id -f -i /etc/ceph/ceph.pub  -o StrictHostKeyChecking=no root@192.168.100.10{{ number }}
   {% if expanded_cluster is defined %}
-    ${cephadm_shell} ceph orch host add {{ prefix }}-node-0{{ number }}
+    ${cephadm_shell} ceph orch host add {{ prefix }}-node-0{{ number }} 192.168.100.10{{ number }}
   {% endif %}
 {% endfor %}
 
diff --git a/src/pybind/mgr/dashboard/ci/cephadm/run-cephadm-e2e-tests.sh b/src/pybind/mgr/dashboard/ci/cephadm/run-cephadm-e2e-tests.sh
@@ -38,22 +38,4 @@ cypress_run () {
 
 cd ${CEPH_DEV_FOLDER}/src/pybind/mgr/dashboard/frontend
 
-kcli ssh -u root ceph-node-00 'cephadm shell "ceph config set mgr mgr/prometheus/exclude_perf_counters false"'
-
-# check if the prometheus daemon is running
-# before starting the e2e tests
-
-PROMETHEUS_RUNNING_COUNT=$(kcli ssh -u root ceph-node-00 'cephadm shell "ceph orch ls --service_name=prometheus --format=json"' | jq -r '.[] | .status.running')
-while [[ $PROMETHEUS_RUNNING_COUNT -lt 1 ]]; do
-    PROMETHEUS_RUNNING_COUNT=$(kcli ssh -u root ceph-node-00 'cephadm shell "ceph orch ls --service_name=prometheus --format=json"' | jq -r '.[] | .status.running')
-done
-
-# grafana ip address is set to the fqdn by default.
-# kcli is not working with that, so setting the IP manually.
-kcli ssh -u root ceph-node-00 'cephadm shell "ceph dashboard set-alertmanager-api-host http://192.168.100.100:9093"'
-kcli ssh -u root ceph-node-00 'cephadm shell "ceph dashboard set-prometheus-api-host http://192.168.100.100:9095"'
-kcli ssh -u root ceph-node-00 'cephadm shell "ceph dashboard set-grafana-api-url https://192.168.100.100:3000"'
-kcli ssh -u root ceph-node-00 'cephadm shell "ceph orch apply node-exporter --placement 'count:2'"'
-
-cypress_run ["cypress/e2e/orchestrator/workflow/*.feature","cypress/e2e/orchestrator/workflow/*-spec.ts"]
-cypress_run "cypress/e2e/orchestrator/grafana/*.feature"
+cypress_run ["cypress/e2e/orchestrator/workflow/*.feature","cypress/e2e/orchestrator/workflow/*-spec.ts","cypress/e2e/orchestrator/grafana/*.feature"]
diff --git a/src/pybind/mgr/dashboard/ci/cephadm/start-cluster.sh b/src/pybind/mgr/dashboard/ci/cephadm/start-cluster.sh
@@ -82,3 +82,37 @@ while [[ -z $(kcli ssh -u root -- ceph-node-00 'journalctl --no-tail --no-pager
     fi
     kcli ssh -u root -- ceph-node-00 'journalctl -n 100 --no-pager -t cloud-init'
 done
+
+kcli ssh -u root ceph-node-00 'cephadm shell "ceph config set mgr mgr/prometheus/exclude_perf_counters false"'
+
+get_prometheus_running_count() {
+    echo $(kcli ssh -u root ceph-node-00 'cephadm shell "ceph orch ls --service_name=prometheus --format=json"' | jq -r '.[] | .status.running')
+}
+
+# check if the prometheus daemon is running on jenkins node
+# before starting the e2e tests
+if [[ -n "${JENKINS_HOME}" ]]; then
+    retry=0
+    PROMETHEUS_RUNNING_COUNT=$(get_prometheus_running_count)
+    # retrying for 10 times to see if we can get the prometheus count
+    # otherwise this would run indefinitely and bloat up the machine
+    while [[ $retry -lt 10 && $PROMETHEUS_RUNNING_COUNT -lt 1 ]]; do
+        if [[ ${retry} -gt 0 ]]; then
+            echo "Retry attempt to get the prometheus count..." ${retry}
+        fi
+        PROMETHEUS_RUNNING_COUNT=$(get_prometheus_running_count)
+        retry=$((retry +1))
+        sleep 10
+    done
+
+    if [[ ${retry} -ge 10 ]]; then
+        exit 1
+    fi
+
+    # grafana ip address is set to the fqdn by default.
+    # kcli is not working with that, so setting the IP manually.
+    kcli ssh -u root ceph-node-00 'cephadm shell "ceph dashboard set-alertmanager-api-host http://192.168.100.100:9093"'
+    kcli ssh -u root ceph-node-00 'cephadm shell "ceph dashboard set-prometheus-api-host http://192.168.100.100:9095"'
+    kcli ssh -u root ceph-node-00 'cephadm shell "ceph dashboard set-grafana-api-url https://192.168.100.100:3000"'
+    kcli ssh -u root ceph-node-00 'cephadm shell "ceph orch apply node-exporter --placement 'count:2'"'
+fi