Skip to content

Commit 9488c8e

Browse files
committed
High CPU usage alert now only applies to control and login nodes
1 parent b7d9c48 commit 9488c8e

File tree

2 files changed

+10
-0
lines changed
  • ansible/roles/kube_prometheus_stack/defaults/main
  • environments/common/inventory/group_vars/all

2 files changed

+10
-0
lines changed

ansible/roles/kube_prometheus_stack/defaults/main/helm.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,8 @@ kube_prometheus_stack_release_defaults:
5151
KubeSchedulerDown: true
5252
KubeProxyDown: true
5353
KubeControllerManagerDown: true
54+
# Replaced with appliance-specific versions
55+
NodeCPUHighUsage: true
5456
prometheus:
5557
service:
5658
type: NodePort

environments/common/inventory/group_vars/all/prometheus.yml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,3 +44,11 @@ prometheus_extra_alerting_rules:
4444
expr: "slurm_nodes_down > 0\n"
4545
labels:
4646
severity: critical
47+
- alert: NodeCPUHighUsage
48+
annotations:
49+
description: '{% raw %}CPU usage at {{ $labels.instance }} has been above 90% for the last 15 minutes, is currently at {{ printf "%.2f" $value }}%.{% endraw %}'
50+
summary: High CPU usage.
51+
expr: sum without(mode) (avg without (cpu) (rate(node_cpu_seconds_total{job="node-exporter",mode!="idle",instance=~".+(-control|-login).*"}[2m]))) * 100 > 90
52+
for: 15m
53+
labels:
54+
severity: info

0 commit comments

Comments
 (0)