From c9be4e4b9aed9d24fedea17fda0e6df6a896fa01 Mon Sep 17 00:00:00 2001 From: Will Szumski Date: Fri, 1 Nov 2024 16:26:27 +0000 Subject: [PATCH] Redfish exporter: Decrease sensitivity of alert The redfish exporter is prone to failed scrapes. Let's wait for multiple failed scrapes before triggering an alert. This should still catch the case where it is completely dead, but reduce the false positives from failed scrapes. --- etc/kayobe/kolla/config/prometheus/prometheus.rules | 11 ++++++++++- ...ity-of-redfish-target-alerts-a3d77a3f0c3dac8a.yaml | 6 ++++++ 2 files changed, 16 insertions(+), 1 deletion(-) create mode 100644 releasenotes/notes/reduces-sensitivity-of-redfish-target-alerts-a3d77a3f0c3dac8a.yaml diff --git a/etc/kayobe/kolla/config/prometheus/prometheus.rules b/etc/kayobe/kolla/config/prometheus/prometheus.rules index c9803946a..20e1b303a 100644 --- a/etc/kayobe/kolla/config/prometheus/prometheus.rules +++ b/etc/kayobe/kolla/config/prometheus/prometheus.rules @@ -7,7 +7,7 @@ groups: rules: - alert: PrometheusTargetMissing - expr: up == 0 + expr: up{job!="redfish-exporter-seed"} == 0 for: 5m labels: severity: critical @@ -15,6 +15,15 @@ groups: summary: "Prometheus target missing (instance {{ $labels.instance }})" description: "A Prometheus target has disappeared. An exporter might have crashed." + - alert: PrometheusTargetMissing + expr: up{job="redfish-exporter-seed"} == 0 + for: 15m + labels: + severity: critical + annotations: + summary: "Prometheus target missing (instance {{ $labels.instance }})" + description: "A Prometheus target has disappeared. An exporter might have crashed." 
+ - alert: PrometheusAllTargetsMissing expr: count by (job) (up) == 0 for: 1m diff --git a/releasenotes/notes/reduces-sensitivity-of-redfish-target-alerts-a3d77a3f0c3dac8a.yaml b/releasenotes/notes/reduces-sensitivity-of-redfish-target-alerts-a3d77a3f0c3dac8a.yaml new file mode 100644 index 000000000..0ba59ea7a --- /dev/null +++ b/releasenotes/notes/reduces-sensitivity-of-redfish-target-alerts-a3d77a3f0c3dac8a.yaml @@ -0,0 +1,6 @@ +--- +fixes: + - | + Changes the duration for which redfish exporter must continually fail + scrapes before triggering an alert to 15 minutes. This should hopefully + reduce some alert spam.