@@ -39,3 +39,45 @@ def test_prometheus_node_exporter_metrics(prom):
3939 """Check that expected node exporter metrics exist."""
4040 metrics = prom .all_metrics ()
4141 assert "node_cpu_seconds_total" in metrics
42+
43+
def test_prometheus_alerts_inactive(prom):
    """Check that no unexpected Prometheus alerts are active.

    Queries the ``/api/v1/alerts`` endpoint through the session held by
    ``prom``, drops the alerts that are expected in an all-in-one (AIO)
    environment, and fails if any other alert is left over.
    """
    # https://prometheus.io/docs/prometheus/latest/querying/api/#alerts
    response = prom._session.get(
        "{0}/api/v1/alerts".format(prom.url),
        verify=prom._session.verify,
        headers=prom.headers,
        auth=prom.auth,
        cert=prom._session.cert,
    )
    assert response.ok, "Alerts query failed: {0!r}".format(response)

    payload = response.json()
    assert "status" in payload
    assert payload["status"] == "success", "Unexpected API status: {0!r}".format(
        payload["status"]
    )
    assert "data" in payload
    # The "alerts" value may be empty/null; normalise it to a list.
    alerts = payload["data"]["alerts"] or []

    # NOTE(MaxN): Allow for, and filter out, alerts we'd expect to see in an
    # AIO environment.
    # TODO: find a way of configuring this for SCT running in other
    # environments.
    aio_alerts_to_ignore = [
        # We know our volumes are small.
        {"alertname": "StorageFillingUp", "instance": "controller0"},
        # This is probably due to storage space..
        {"alertname": "ElasticsearchClusterYellow", "instance": "controller0"},
        # ..or because we're running in a single instance and it wants to be
        # clustered across multiple nodes.
        {"alertname": "ElasticsearchUnassignedShards", "instance": "controller0"},
        # It's a small AIO!
        {"alertname": "LowMemory", "instance": "controller0"},
        # It's only one node and expects three, see
        # https://github.com/stackhpc/stackhpc-kayobe-config/pull/1579
        {"alertname": "RabbitMQNodeDown"},
        # This is probably because Tempest runs before pytest so the container
        # has been recently stopped.
        {"alertname": "ContainerKilled", "name": "tempest"},
    ]

    def alert_is_ignored(labels, alerts_to_ignore):
        """Return True if any ignore entry is a subset of ``labels``."""
        # An ignore entry matches when every one of its key/value pairs is
        # also present in the alert's labels (dict item views compare as sets).
        return any(
            ignored.items() <= labels.items() for ignored in alerts_to_ignore
        )

    unexpected_alerts = [
        alert
        for alert in alerts
        if not alert_is_ignored(alert["labels"], aio_alerts_to_ignore)
    ]
    # Report only the labels so a failure names the offending alerts directly.
    assert not unexpected_alerts, "Unexpected active alerts: {0}".format(
        [alert["labels"] for alert in unexpected_alerts]
    )
0 commit comments