Skip to content

Commit 8ea6b9c

Browse files
authored
Merge pull request #5 from stackhpc/prometheus-alerts
Add Prometheus alerts test
2 parents 2d36542 + 7756b4f commit 8ea6b9c

File tree

1 file changed

+42
-0
lines changed

1 file changed

+42
-0
lines changed

stackhpc_cloud_tests/monitoring/test_prometheus.py

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,3 +39,45 @@ def test_prometheus_node_exporter_metrics(prom):
3939
"""Check that expected node exporter metrics exist."""
4040
metrics = prom.all_metrics()
4141
assert "node_cpu_seconds_total" in metrics
42+
43+
44+
def test_prometheus_alerts_inactive(prom):
    """Check that no Prometheus alerts are active.

    Queries the Prometheus ``/api/v1/alerts`` endpoint through the session
    held by the ``prom`` fixture, drops the alerts that are expected in an
    all-in-one (AIO) environment, and fails if any alert remains.

    An alert is ignored when every label in one of the "ignore" entries is
    present, with the same value, in the alert's ``labels`` mapping.
    """
    # https://prometheus.io/docs/prometheus/latest/querying/api/#alerts
    response = prom._session.get(
        "{0}/api/v1/alerts".format(prom.url),
        verify=prom._session.verify,
        headers=prom.headers,
        auth=prom.auth,
        cert=prom._session.cert,
    )
    assert response.ok, "Alerts API request failed: {0!r}".format(response)

    payload = response.json()
    assert "status" in payload, "No 'status' in alerts API response: {0!r}".format(payload)
    assert payload["status"] == "success", "Alerts API returned status {0!r}".format(payload["status"])
    assert "data" in payload, "No 'data' in alerts API response: {0!r}".format(payload)
    # A null/empty "alerts" value is treated the same as "no alerts".
    alerts = payload["data"]["alerts"] or []

    # (MaxN) Allow for, and filter out, alerts we'd expect to see in an AIO environment.
    # TODO - find a way of configuring this for SCT running in other environments.
    aio_alerts_to_ignore = [
        # We know our volumes are small.
        {"alertname": "StorageFillingUp", "instance": "controller0"},
        # This is probably due to storage space..
        {"alertname": "ElasticsearchClusterYellow", "instance": "controller0"},
        # ..or because we're running in a single instance and it wants to be clustered across multiple nodes.
        {"alertname": "ElasticsearchUnassignedShards", "instance": "controller0"},
        # It's a small AIO!
        {"alertname": "LowMemory", "instance": "controller0"},
        # It's only one node and expects three, see https://github.com/stackhpc/stackhpc-kayobe-config/pull/1579
        {"alertname": "RabbitMQNodeDown"},
        # This is probably because Tempest runs before pytest so the container has been recently stopped.
        {"alertname": "ContainerKilled", "name": "tempest"},
    ]

    def alert_is_ignored(alert, alerts_to_ignore):
        """Return True if any "ignore case" is a subset of the alert's labels."""
        # dict item views are set-like, so `<=` is a subset test on
        # (label, value) pairs.
        return any(
            alert_to_ignore.items() <= alert.items()
            for alert_to_ignore in alerts_to_ignore
        )

    unexpected_alerts = [
        alert
        for alert in alerts
        if not alert_is_ignored(alert["labels"], aio_alerts_to_ignore)
    ]
    # Report the labels of whatever is left so a failure is actionable.
    assert not unexpected_alerts, "Unexpected active Prometheus alerts: {0}".format(
        [alert["labels"] for alert in unexpected_alerts]
    )

0 commit comments

Comments
 (0)