
Commit 6849754

Remove prometheus alerts (#298)
Signed-off-by: ArthurSens <[email protected]>
Signed-off-by: ArthurSens <[email protected]>
Co-authored-by: Aleksandar Aleksandrov <[email protected]>
Parent: fd04e3a

2 files changed: +25, -264 lines


lib/alert-filter.libsonnet

Lines changed: 24 additions & 0 deletions
@@ -77,6 +77,30 @@ local unwatedAlerts = [
   'ConfigReloaderSidecarErrors', // Re-added to platform-mixin
   'PrometheusOperatorRejectedResources',
   'PrometheusOperatorSyncFailed',
+
+  // From Prometheus
+  'PrometheusBadConfig', // Re-added to platform-mixin
+  'PrometheusNotificationQueueRunningFull',
+  'PrometheusErrorSendingAlertsToSomeAlertmanagers',
+  'PrometheusNotConnectedToAlertmanagers',
+  'PrometheusTSDBReloadsFailing',
+  'PrometheusTSDBCompactionsFailing',
+  'PrometheusNotIngestingSamples',
+  'PrometheusDuplicateTimestamps',
+  'PrometheusOutOfOrderTimestamps',
+  'PrometheusRemoteStorageFailures', // Re-added to platform-mixin
+  'PrometheusRemoteWriteBehind',
+  'PrometheusRemoteWriteDesiredShards',
+  'PrometheusRuleFailures', // Re-added to platform-mixin
+  'PrometheusMissingRuleEvaluations',
+  'PrometheusTargetLimitHit',
+  'PrometheusLabelLimitHit',
+  'PrometheusScrapeBodySizeLimitHit',
+  'PrometheusScrapeSampleLimitHit',
+  'PrometheusTargetSyncFailure',
+  'PrometheusHighQueryLoad',
+  'PrometheusErrorSendingAlertsToAnyAlertmanager',
+
 ];
 
 {
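
Note on how the deny-list takes effect: the hunk above only extends `unwatedAlerts`; the code that consumes it is not part of this diff. The following is a minimal, hypothetical sketch (not the repository's actual lib/alert-filter.libsonnet) of how such a list is typically applied to a rule group with `std.filter`. The `filterGroup` helper, the shortened `unwantedAlerts` list, and the sample group are invented for the example.

// Hypothetical illustration only; names and data are stand-ins.
local unwantedAlerts = ['PrometheusBadConfig', 'PrometheusRuleFailures'];

// Drop deny-listed alerting rules from a rule group; recording rules are kept.
local filterGroup(group) = group {
  rules: std.filter(
    function(rule) !std.objectHas(rule, 'alert') || !std.member(unwantedAlerts, rule.alert),
    super.rules,
  ),
};

{
  // Example input group, trimmed for brevity.
  exampleGroup:: {
    name: 'prometheus',
    rules: [
      { alert: 'PrometheusBadConfig', expr: 'vector(1)' },
      { record: 'instance:example:rate5m', expr: 'vector(1)' },
    ],
  },
  filtered: filterGroup(self.exampleGroup),  // only the recording rule survives
}

Evaluating this with jsonnet leaves only the recording rule in `filtered.rules`, which mirrors the effect this commit has on the generated manifest below.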

monitoring-satellite/manifests/kube-prometheus-rules/rules.yaml

Lines changed: 1 addition & 264 deletions
@@ -1183,270 +1183,7 @@ spec:
         )
       record: instance:node_network_transmit_drop_excluding_lo:rate5m
   - name: prometheus
-    rules:
-    - alert: PrometheusBadConfig
-      annotations:
-        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to reload its configuration.
-        runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/PrometheusBadConfig.md
-        summary: Failed Prometheus configuration reload.
-      expr: |
-        # Without max_over_time, failed scrapes could create false negatives, see
-        # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
-        max_over_time(prometheus_config_last_reload_successful{job="prometheus-k8s",namespace="monitoring-satellite"}[5m]) == 0
-      for: 10m
-      labels:
-        severity: critical
-    - alert: PrometheusNotificationQueueRunningFull
-      annotations:
-        description: Alert notification queue of Prometheus {{$labels.namespace}}/{{$labels.pod}} is running full.
-        runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/PrometheusNotificationQueueRunningFull.md
-        summary: Prometheus alert notification queue predicted to run full in less than 30m.
-      expr: |
-        # Without min_over_time, failed scrapes could create false negatives, see
-        # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
-        (
-          predict_linear(prometheus_notifications_queue_length{job="prometheus-k8s",namespace="monitoring-satellite"}[5m], 60 * 30)
-        >
-          min_over_time(prometheus_notifications_queue_capacity{job="prometheus-k8s",namespace="monitoring-satellite"}[5m])
-        )
-      for: 15m
-      labels:
-        severity: warning
-    - alert: PrometheusErrorSendingAlertsToSomeAlertmanagers
-      annotations:
-        description: '{{ printf "%.1f" $value }}% errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to Alertmanager {{$labels.alertmanager}}.'
-        runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/PrometheusErrorSendingAlertsToSomeAlertmanagers.md
-        summary: Prometheus has encountered more than 1% errors sending alerts to a specific Alertmanager.
-      expr: |
-        (
-          rate(prometheus_notifications_errors_total{job="prometheus-k8s",namespace="monitoring-satellite"}[5m])
-        /
-          rate(prometheus_notifications_sent_total{job="prometheus-k8s",namespace="monitoring-satellite"}[5m])
-        )
-        * 100
-        > 1
-      for: 15m
-      labels:
-        severity: warning
-    - alert: PrometheusNotConnectedToAlertmanagers
-      annotations:
-        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is not connected to any Alertmanagers.
-        runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/PrometheusNotConnectedToAlertmanagers.md
-        summary: Prometheus is not connected to any Alertmanagers.
-      expr: |
-        # Without max_over_time, failed scrapes could create false negatives, see
-        # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
-        max_over_time(prometheus_notifications_alertmanagers_discovered{job="prometheus-k8s",namespace="monitoring-satellite"}[5m]) < 1
-      for: 10m
-      labels:
-        severity: warning
-    - alert: PrometheusTSDBReloadsFailing
-      annotations:
-        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected {{$value | humanize}} reload failures over the last 3h.
-        runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/PrometheusTSDBReloadsFailing.md
-        summary: Prometheus has issues reloading blocks from disk.
-      expr: |
-        increase(prometheus_tsdb_reloads_failures_total{job="prometheus-k8s",namespace="monitoring-satellite"}[3h]) > 0
-      for: 4h
-      labels:
-        severity: warning
-    - alert: PrometheusTSDBCompactionsFailing
-      annotations:
-        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected {{$value | humanize}} compaction failures over the last 3h.
-        runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/PrometheusTSDBCompactionsFailing.md
-        summary: Prometheus has issues compacting blocks.
-      expr: |
-        increase(prometheus_tsdb_compactions_failed_total{job="prometheus-k8s",namespace="monitoring-satellite"}[3h]) > 0
-      for: 4h
-      labels:
-        severity: warning
-    - alert: PrometheusNotIngestingSamples
-      annotations:
-        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is not ingesting samples.
-        runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/PrometheusNotIngestingSamples.md
-        summary: Prometheus is not ingesting samples.
-      expr: |
-        (
-          rate(prometheus_tsdb_head_samples_appended_total{job="prometheus-k8s",namespace="monitoring-satellite"}[5m]) <= 0
-        and
-          (
-            sum without(scrape_job) (prometheus_target_metadata_cache_entries{job="prometheus-k8s",namespace="monitoring-satellite"}) > 0
-          or
-            sum without(rule_group) (prometheus_rule_group_rules{job="prometheus-k8s",namespace="monitoring-satellite"}) > 0
-          )
-        )
-      for: 10m
-      labels:
-        severity: warning
-    - alert: PrometheusDuplicateTimestamps
-      annotations:
-        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping {{ printf "%.4g" $value }} samples/s with different values but duplicated timestamp.
-        runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/PrometheusDuplicateTimestamps.md
-        summary: Prometheus is dropping samples with duplicate timestamps.
-      expr: |
-        rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="prometheus-k8s",namespace="monitoring-satellite"}[5m]) > 0
-      for: 10m
-      labels:
-        severity: warning
-    - alert: PrometheusOutOfOrderTimestamps
-      annotations:
-        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping {{ printf "%.4g" $value }} samples/s with timestamps arriving out of order.
-        runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/PrometheusOutOfOrderTimestamps.md
-        summary: Prometheus drops samples with out-of-order timestamps.
-      expr: |
-        rate(prometheus_target_scrapes_sample_out_of_order_total{job="prometheus-k8s",namespace="monitoring-satellite"}[5m]) > 0
-      for: 10m
-      labels:
-        severity: warning
-    - alert: PrometheusRemoteStorageFailures
-      annotations:
-        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} failed to send {{ printf "%.1f" $value }}% of the samples to {{ $labels.remote_name}}:{{ $labels.url }}
-        runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/PrometheusRemoteStorageFailures.md
-        summary: Prometheus fails to send samples to remote storage.
-      expr: |
-        (
-          (rate(prometheus_remote_storage_failed_samples_total{job="prometheus-k8s",namespace="monitoring-satellite"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job="prometheus-k8s",namespace="monitoring-satellite"}[5m]))
-        /
-          (
-            (rate(prometheus_remote_storage_failed_samples_total{job="prometheus-k8s",namespace="monitoring-satellite"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job="prometheus-k8s",namespace="monitoring-satellite"}[5m]))
-          +
-            (rate(prometheus_remote_storage_succeeded_samples_total{job="prometheus-k8s",namespace="monitoring-satellite"}[5m]) or rate(prometheus_remote_storage_samples_total{job="prometheus-k8s",namespace="monitoring-satellite"}[5m]))
-          )
-        )
-        * 100
-        > 1
-      for: 15m
-      labels:
-        severity: critical
-    - alert: PrometheusRemoteWriteBehind
-      annotations:
-        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} remote write is {{ printf "%.1f" $value }}s behind for {{ $labels.remote_name}}:{{ $labels.url }}.
-        runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/PrometheusRemoteWriteBehind.md
-        summary: Prometheus remote write is behind.
-      expr: |
-        # Without max_over_time, failed scrapes could create false negatives, see
-        # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
-        (
-          max_over_time(prometheus_remote_storage_highest_timestamp_in_seconds{job="prometheus-k8s",namespace="monitoring-satellite"}[5m])
-        - ignoring(remote_name, url) group_right
-          max_over_time(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{job="prometheus-k8s",namespace="monitoring-satellite"}[5m])
-        )
-        > 120
-      for: 15m
-      labels:
-        severity: critical
-    - alert: PrometheusRemoteWriteDesiredShards
-      annotations:
-        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} remote write desired shards calculation wants to run {{ $value }} shards for queue {{ $labels.remote_name}}:{{ $labels.url }}, which is more than the max of {{ printf `prometheus_remote_storage_shards_max{instance="%s",job="prometheus-k8s",namespace="monitoring-satellite"}` $labels.instance | query | first | value }}.
-        runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/PrometheusRemoteWriteDesiredShards.md
-        summary: Prometheus remote write desired shards calculation wants to run more than configured max shards.
-      expr: |
-        # Without max_over_time, failed scrapes could create false negatives, see
-        # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
-        (
-          max_over_time(prometheus_remote_storage_shards_desired{job="prometheus-k8s",namespace="monitoring-satellite"}[5m])
-        >
-          max_over_time(prometheus_remote_storage_shards_max{job="prometheus-k8s",namespace="monitoring-satellite"}[5m])
-        )
-      for: 15m
-      labels:
-        severity: warning
-    - alert: PrometheusRuleFailures
-      annotations:
-        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to evaluate {{ printf "%.0f" $value }} rules in the last 5m.
-        runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/PrometheusRuleFailures.md
-        summary: Prometheus is failing rule evaluations.
-      expr: |
-        increase(prometheus_rule_evaluation_failures_total{job="prometheus-k8s",namespace="monitoring-satellite"}[5m]) > 0
-      for: 15m
-      labels:
-        severity: warning
-    - alert: PrometheusMissingRuleEvaluations
-      annotations:
-        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has missed {{ printf "%.0f" $value }} rule group evaluations in the last 5m.
-        runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/PrometheusMissingRuleEvaluations.md
-        summary: Prometheus is missing rule evaluations due to slow rule group evaluation.
-      expr: |
-        increase(prometheus_rule_group_iterations_missed_total{job="prometheus-k8s",namespace="monitoring-satellite"}[5m]) > 0
-      for: 15m
-      labels:
-        severity: warning
-    - alert: PrometheusTargetLimitHit
-      annotations:
-        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has dropped {{ printf "%.0f" $value }} targets because the number of targets exceeded the configured target_limit.
-        runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/PrometheusTargetLimitHit.md
-        summary: Prometheus has dropped targets because some scrape configs have exceeded the targets limit.
-      expr: |
-        increase(prometheus_target_scrape_pool_exceeded_target_limit_total{job="prometheus-k8s",namespace="monitoring-satellite"}[5m]) > 0
-      for: 15m
-      labels:
-        severity: warning
-    - alert: PrometheusLabelLimitHit
-      annotations:
-        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has dropped {{ printf "%.0f" $value }} targets because some samples exceeded the configured label_limit, label_name_length_limit or label_value_length_limit.
-        runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/PrometheusLabelLimitHit.md
-        summary: Prometheus has dropped targets because some scrape configs have exceeded the labels limit.
-      expr: |
-        increase(prometheus_target_scrape_pool_exceeded_label_limits_total{job="prometheus-k8s",namespace="monitoring-satellite"}[5m]) > 0
-      for: 15m
-      labels:
-        severity: warning
-    - alert: PrometheusScrapeBodySizeLimitHit
-      annotations:
-        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed {{ printf "%.0f" $value }} scrapes in the last 5m because some targets exceeded the configured body_size_limit.
-        runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/PrometheusScrapeBodySizeLimitHit.md
-        summary: Prometheus has dropped some targets that exceeded body size limit.
-      expr: |
-        increase(prometheus_target_scrapes_exceeded_body_size_limit_total{job="prometheus-k8s",namespace="monitoring-satellite"}[5m]) > 0
-      for: 15m
-      labels:
-        severity: warning
-    - alert: PrometheusScrapeSampleLimitHit
-      annotations:
-        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed {{ printf "%.0f" $value }} scrapes in the last 5m because some targets exceeded the configured sample_limit.
-        runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/PrometheusScrapeSampleLimitHit.md
-        summary: Prometheus has failed scrapes that have exceeded the configured sample limit.
-      expr: |
-        increase(prometheus_target_scrapes_exceeded_sample_limit_total{job="prometheus-k8s",namespace="monitoring-satellite"}[5m]) > 0
-      for: 15m
-      labels:
-        severity: warning
-    - alert: PrometheusTargetSyncFailure
-      annotations:
-        description: '{{ printf "%.0f" $value }} targets in Prometheus {{$labels.namespace}}/{{$labels.pod}} have failed to sync because invalid configuration was supplied.'
-        runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/PrometheusTargetSyncFailure.md
-        summary: Prometheus has failed to sync targets.
-      expr: |
-        increase(prometheus_target_sync_failed_total{job="prometheus-k8s",namespace="monitoring-satellite"}[30m]) > 0
-      for: 5m
-      labels:
-        severity: critical
-    - alert: PrometheusHighQueryLoad
-      annotations:
-        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} query API has less than 20% available capacity in its query engine for the last 15 minutes.
-        runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/PrometheusHighQueryLoad.md
-        summary: Prometheus is reaching its maximum capacity serving concurrent requests.
-      expr: |
-        avg_over_time(prometheus_engine_queries{job="prometheus-k8s",namespace="monitoring-satellite"}[5m]) / max_over_time(prometheus_engine_queries_concurrent_max{job="prometheus-k8s",namespace="monitoring-satellite"}[5m]) > 0.8
-      for: 15m
-      labels:
-        severity: warning
-    - alert: PrometheusErrorSendingAlertsToAnyAlertmanager
-      annotations:
-        description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to any Alertmanager.'
-        runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/PrometheusErrorSendingAlertsToAnyAlertmanager.md
-        summary: Prometheus encounters more than 3% errors sending alerts to any Alertmanager.
-      expr: |
-        min without (alertmanager) (
-          rate(prometheus_notifications_errors_total{job="prometheus-k8s",namespace="monitoring-satellite",alertmanager!~``}[5m])
-        /
-          rate(prometheus_notifications_sent_total{job="prometheus-k8s",namespace="monitoring-satellite",alertmanager!~``}[5m])
-        )
-        * 100
-        > 3
-      for: 15m
-      labels:
-        severity: critical
+    rules: []
   - name: prometheus-operator
     rules: []
   - name: config-reloaders