@@ -1183,270 +1183,7 @@ spec:
         )
       record: instance:node_network_transmit_drop_excluding_lo:rate5m
   - name: prometheus
-    rules:
-    - alert: PrometheusBadConfig
-      annotations:
-        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to reload its configuration.
-        runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/PrometheusBadConfig.md
-        summary: Failed Prometheus configuration reload.
-      expr: |
-        # Without max_over_time, failed scrapes could create false negatives, see
-        # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
-        max_over_time(prometheus_config_last_reload_successful{job="prometheus-k8s",namespace="monitoring-satellite"}[5m]) == 0
-      for: 10m
-      labels:
-        severity: critical
-    - alert: PrometheusNotificationQueueRunningFull
-      annotations:
-        description: Alert notification queue of Prometheus {{$labels.namespace}}/{{$labels.pod}} is running full.
-        runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/PrometheusNotificationQueueRunningFull.md
-        summary: Prometheus alert notification queue predicted to run full in less than 30m.
-      expr: |
-        # Without min_over_time, failed scrapes could create false negatives, see
-        # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
-        (
-          predict_linear(prometheus_notifications_queue_length{job="prometheus-k8s",namespace="monitoring-satellite"}[5m], 60 * 30)
-        >
-          min_over_time(prometheus_notifications_queue_capacity{job="prometheus-k8s",namespace="monitoring-satellite"}[5m])
-        )
-      for: 15m
-      labels:
-        severity: warning
-    - alert: PrometheusErrorSendingAlertsToSomeAlertmanagers
-      annotations:
-        description: '{{ printf "%.1f" $value }}% errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to Alertmanager {{$labels.alertmanager}}.'
-        runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/PrometheusErrorSendingAlertsToSomeAlertmanagers.md
-        summary: Prometheus has encountered more than 1% errors sending alerts to a specific Alertmanager.
-      expr: |
-        (
-          rate(prometheus_notifications_errors_total{job="prometheus-k8s",namespace="monitoring-satellite"}[5m])
-        /
-          rate(prometheus_notifications_sent_total{job="prometheus-k8s",namespace="monitoring-satellite"}[5m])
-        )
-        * 100
-        > 1
-      for: 15m
-      labels:
-        severity: warning
-    - alert: PrometheusNotConnectedToAlertmanagers
-      annotations:
-        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is not connected to any Alertmanagers.
-        runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/PrometheusNotConnectedToAlertmanagers.md
-        summary: Prometheus is not connected to any Alertmanagers.
-      expr: |
-        # Without max_over_time, failed scrapes could create false negatives, see
-        # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
-        max_over_time(prometheus_notifications_alertmanagers_discovered{job="prometheus-k8s",namespace="monitoring-satellite"}[5m]) < 1
-      for: 10m
-      labels:
-        severity: warning
-    - alert: PrometheusTSDBReloadsFailing
-      annotations:
-        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected {{$value | humanize}} reload failures over the last 3h.
-        runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/PrometheusTSDBReloadsFailing.md
-        summary: Prometheus has issues reloading blocks from disk.
-      expr: |
-        increase(prometheus_tsdb_reloads_failures_total{job="prometheus-k8s",namespace="monitoring-satellite"}[3h]) > 0
-      for: 4h
-      labels:
-        severity: warning
-    - alert: PrometheusTSDBCompactionsFailing
-      annotations:
-        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected {{$value | humanize}} compaction failures over the last 3h.
-        runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/PrometheusTSDBCompactionsFailing.md
-        summary: Prometheus has issues compacting blocks.
-      expr: |
-        increase(prometheus_tsdb_compactions_failed_total{job="prometheus-k8s",namespace="monitoring-satellite"}[3h]) > 0
-      for: 4h
-      labels:
-        severity: warning
-    - alert: PrometheusNotIngestingSamples
-      annotations:
-        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is not ingesting samples.
-        runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/PrometheusNotIngestingSamples.md
-        summary: Prometheus is not ingesting samples.
-      expr: |
-        (
-          rate(prometheus_tsdb_head_samples_appended_total{job="prometheus-k8s",namespace="monitoring-satellite"}[5m]) <= 0
-        and
-          (
-            sum without(scrape_job) (prometheus_target_metadata_cache_entries{job="prometheus-k8s",namespace="monitoring-satellite"}) > 0
-          or
-            sum without(rule_group) (prometheus_rule_group_rules{job="prometheus-k8s",namespace="monitoring-satellite"}) > 0
-          )
-        )
-      for: 10m
-      labels:
-        severity: warning
-    - alert: PrometheusDuplicateTimestamps
-      annotations:
-        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping {{ printf "%.4g" $value }} samples/s with different values but duplicated timestamp.
-        runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/PrometheusDuplicateTimestamps.md
-        summary: Prometheus is dropping samples with duplicate timestamps.
-      expr: |
-        rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="prometheus-k8s",namespace="monitoring-satellite"}[5m]) > 0
-      for: 10m
-      labels:
-        severity: warning
-    - alert: PrometheusOutOfOrderTimestamps
-      annotations:
-        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping {{ printf "%.4g" $value }} samples/s with timestamps arriving out of order.
-        runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/PrometheusOutOfOrderTimestamps.md
-        summary: Prometheus drops samples with out-of-order timestamps.
-      expr: |
-        rate(prometheus_target_scrapes_sample_out_of_order_total{job="prometheus-k8s",namespace="monitoring-satellite"}[5m]) > 0
-      for: 10m
-      labels:
-        severity: warning
-    - alert: PrometheusRemoteStorageFailures
-      annotations:
-        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} failed to send {{ printf "%.1f" $value }}% of the samples to {{ $labels.remote_name}}:{{ $labels.url }}
-        runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/PrometheusRemoteStorageFailures.md
-        summary: Prometheus fails to send samples to remote storage.
-      expr: |
-        (
-          (rate(prometheus_remote_storage_failed_samples_total{job="prometheus-k8s",namespace="monitoring-satellite"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job="prometheus-k8s",namespace="monitoring-satellite"}[5m]))
-        /
-          (
-            (rate(prometheus_remote_storage_failed_samples_total{job="prometheus-k8s",namespace="monitoring-satellite"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job="prometheus-k8s",namespace="monitoring-satellite"}[5m]))
-          +
-            (rate(prometheus_remote_storage_succeeded_samples_total{job="prometheus-k8s",namespace="monitoring-satellite"}[5m]) or rate(prometheus_remote_storage_samples_total{job="prometheus-k8s",namespace="monitoring-satellite"}[5m]))
-          )
-        )
-        * 100
-        > 1
-      for: 15m
-      labels:
-        severity: critical
-    - alert: PrometheusRemoteWriteBehind
-      annotations:
-        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} remote write is {{ printf "%.1f" $value }}s behind for {{ $labels.remote_name}}:{{ $labels.url }}.
-        runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/PrometheusRemoteWriteBehind.md
-        summary: Prometheus remote write is behind.
-      expr: |
-        # Without max_over_time, failed scrapes could create false negatives, see
-        # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
-        (
-          max_over_time(prometheus_remote_storage_highest_timestamp_in_seconds{job="prometheus-k8s",namespace="monitoring-satellite"}[5m])
-        - ignoring(remote_name, url) group_right
-          max_over_time(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{job="prometheus-k8s",namespace="monitoring-satellite"}[5m])
-        )
-        > 120
-      for: 15m
-      labels:
-        severity: critical
-    - alert: PrometheusRemoteWriteDesiredShards
-      annotations:
-        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} remote write desired shards calculation wants to run {{ $value }} shards for queue {{ $labels.remote_name}}:{{ $labels.url }}, which is more than the max of {{ printf `prometheus_remote_storage_shards_max{instance="%s",job="prometheus-k8s",namespace="monitoring-satellite"}` $labels.instance | query | first | value }}.
-        runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/PrometheusRemoteWriteDesiredShards.md
-        summary: Prometheus remote write desired shards calculation wants to run more than configured max shards.
-      expr: |
-        # Without max_over_time, failed scrapes could create false negatives, see
-        # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
-        (
-          max_over_time(prometheus_remote_storage_shards_desired{job="prometheus-k8s",namespace="monitoring-satellite"}[5m])
-        >
-          max_over_time(prometheus_remote_storage_shards_max{job="prometheus-k8s",namespace="monitoring-satellite"}[5m])
-        )
-      for: 15m
-      labels:
-        severity: warning
-    - alert: PrometheusRuleFailures
-      annotations:
-        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to evaluate {{ printf "%.0f" $value }} rules in the last 5m.
-        runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/PrometheusRuleFailures.md
-        summary: Prometheus is failing rule evaluations.
-      expr: |
-        increase(prometheus_rule_evaluation_failures_total{job="prometheus-k8s",namespace="monitoring-satellite"}[5m]) > 0
-      for: 15m
-      labels:
-        severity: warning
-    - alert: PrometheusMissingRuleEvaluations
-      annotations:
-        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has missed {{ printf "%.0f" $value }} rule group evaluations in the last 5m.
-        runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/PrometheusMissingRuleEvaluations.md
-        summary: Prometheus is missing rule evaluations due to slow rule group evaluation.
-      expr: |
-        increase(prometheus_rule_group_iterations_missed_total{job="prometheus-k8s",namespace="monitoring-satellite"}[5m]) > 0
-      for: 15m
-      labels:
-        severity: warning
-    - alert: PrometheusTargetLimitHit
-      annotations:
-        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has dropped {{ printf "%.0f" $value }} targets because the number of targets exceeded the configured target_limit.
-        runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/PrometheusTargetLimitHit.md
-        summary: Prometheus has dropped targets because some scrape configs have exceeded the targets limit.
-      expr: |
-        increase(prometheus_target_scrape_pool_exceeded_target_limit_total{job="prometheus-k8s",namespace="monitoring-satellite"}[5m]) > 0
-      for: 15m
-      labels:
-        severity: warning
-    - alert: PrometheusLabelLimitHit
-      annotations:
-        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has dropped {{ printf "%.0f" $value }} targets because some samples exceeded the configured label_limit, label_name_length_limit or label_value_length_limit.
-        runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/PrometheusLabelLimitHit.md
-        summary: Prometheus has dropped targets because some scrape configs have exceeded the labels limit.
-      expr: |
-        increase(prometheus_target_scrape_pool_exceeded_label_limits_total{job="prometheus-k8s",namespace="monitoring-satellite"}[5m]) > 0
-      for: 15m
-      labels:
-        severity: warning
-    - alert: PrometheusScrapeBodySizeLimitHit
-      annotations:
-        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed {{ printf "%.0f" $value }} scrapes in the last 5m because some targets exceeded the configured body_size_limit.
-        runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/PrometheusScrapeBodySizeLimitHit.md
-        summary: Prometheus has dropped some targets that exceeded body size limit.
-      expr: |
-        increase(prometheus_target_scrapes_exceeded_body_size_limit_total{job="prometheus-k8s",namespace="monitoring-satellite"}[5m]) > 0
-      for: 15m
-      labels:
-        severity: warning
-    - alert: PrometheusScrapeSampleLimitHit
-      annotations:
-        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed {{ printf "%.0f" $value }} scrapes in the last 5m because some targets exceeded the configured sample_limit.
-        runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/PrometheusScrapeSampleLimitHit.md
-        summary: Prometheus has failed scrapes that have exceeded the configured sample limit.
-      expr: |
-        increase(prometheus_target_scrapes_exceeded_sample_limit_total{job="prometheus-k8s",namespace="monitoring-satellite"}[5m]) > 0
-      for: 15m
-      labels:
-        severity: warning
-    - alert: PrometheusTargetSyncFailure
-      annotations:
-        description: '{{ printf "%.0f" $value }} targets in Prometheus {{$labels.namespace}}/{{$labels.pod}} have failed to sync because invalid configuration was supplied.'
-        runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/PrometheusTargetSyncFailure.md
-        summary: Prometheus has failed to sync targets.
-      expr: |
-        increase(prometheus_target_sync_failed_total{job="prometheus-k8s",namespace="monitoring-satellite"}[30m]) > 0
-      for: 5m
-      labels:
-        severity: critical
-    - alert: PrometheusHighQueryLoad
-      annotations:
-        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} query API has less than 20% available capacity in its query engine for the last 15 minutes.
-        runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/PrometheusHighQueryLoad.md
-        summary: Prometheus is reaching its maximum capacity serving concurrent requests.
-      expr: |
-        avg_over_time(prometheus_engine_queries{job="prometheus-k8s",namespace="monitoring-satellite"}[5m]) / max_over_time(prometheus_engine_queries_concurrent_max{job="prometheus-k8s",namespace="monitoring-satellite"}[5m]) > 0.8
-      for: 15m
-      labels:
-        severity: warning
-    - alert: PrometheusErrorSendingAlertsToAnyAlertmanager
-      annotations:
-        description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to any Alertmanager.'
-        runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/PrometheusErrorSendingAlertsToAnyAlertmanager.md
-        summary: Prometheus encounters more than 3% errors sending alerts to any Alertmanager.
-      expr: |
-        min without (alertmanager) (
-          rate(prometheus_notifications_errors_total{job="prometheus-k8s",namespace="monitoring-satellite",alertmanager!~``}[5m])
-        /
-          rate(prometheus_notifications_sent_total{job="prometheus-k8s",namespace="monitoring-satellite",alertmanager!~``}[5m])
-        )
-        * 100
-        > 3
-      for: 15m
-      labels:
-        severity: critical
+    rules: []
   - name: prometheus-operator
     rules: []
   - name: config-reloaders