Skip to content

Commit d1d6930

Browse files
author
Pablo Mendez
committed
Add notifications
1 parent f7051af commit d1d6930

File tree

1 file changed

+114
-0
lines changed

1 file changed

+114
-0
lines changed

notifications.yaml

Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,114 @@
1+
endpoints:
2+
# Host CPU check: healthy while overall CPU utilisation stays below 80%.
# NOTE(review): the alert keys used here (failure-threshold, success-threshold,
# send-on-resolved) are Gatus', where `conditions` describe the HEALTHY state
# and the alert fires after `failure-threshold` consecutive failed checks.
# Confirm the consumer of this file follows the same semantics.
- name: "High CPU Usage Check"
  enabled: true
  group: "host"
  # PromQL (URL-encoded below):
  #   100 * (1 - avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[2m])))
  # Utilisation is derived from the idle share. Averaging the non-idle series
  # directly spreads the busy time over every (cpu, mode) pair and under-reports
  # it, so the 80% threshold could never be reached.
  # The value is scaled to a percentage so the threshold is a whole number.
  # NOTE(review): Gatus appears to compare numeric operands as integers, which
  # would truncate a 0-1 fraction to 0 - verify against the deployed version.
  url: "http://prometheus.dms.dappnode:9090/api/v1/query?query=100%20*%20(1%20-%20avg%20by%20(instance)%20(rate(node_cpu_seconds_total%7Bmode%3D%22idle%22%7D%5B2m%5D)))"
  method: "GET"
  interval: "30s"
  conditions:
    # value[1] is the sample value of the first series in the query result.
    - "[BODY].data.result[0].value[1] < 80"
  definition:
    title: "High CPU Usage Alert"
    description: "Triggers if CPU usage exceeds 80% for 1 minute."
  alerts:
    - type: custom
      enabled: true
      description: "CPU usage above 80%"
      failure-threshold: 2  # 30s interval * 2 = 1 minute
      success-threshold: 1
      send-on-resolved: true
20+
21+
# Host memory check: healthy while at least 10% of RAM is still available.
# NOTE(review): the alert keys used here (failure-threshold, success-threshold,
# send-on-resolved) are Gatus', where `conditions` describe the HEALTHY state
# and the alert fires after `failure-threshold` consecutive failed checks.
# Confirm the consumer of this file follows the same semantics.
- name: "Low Available Memory Check"
  enabled: true
  group: "host"
  # PromQL (URL-encoded below):
  #   100 * node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes
  # Scaled to a percentage so the threshold is a whole number.
  # NOTE(review): Gatus appears to compare numeric operands as integers, which
  # would truncate a 0-1 fraction to 0 - verify against the deployed version.
  url: "http://prometheus.dms.dappnode:9090/api/v1/query?query=100%20*%20node_memory_MemAvailable_bytes%20%2F%20node_memory_MemTotal_bytes"
  method: "GET"
  interval: "30s"  # Check every 30 seconds
  conditions:
    # Healthy: 10% or more memory available; anything lower counts as a failure.
    - "[BODY].data.result[0].value[1] >= 10"
  definition:
    title: "Host Out of Memory Alert"
    description: "Triggers if available memory drops below 10% for 2 minutes."
  alerts:
    - type: custom
      enabled: true
      description: "Available memory below 10% for 2 minutes"
      failure-threshold: 4  # 30s interval * 4 = 2 minutes
      success-threshold: 1
      send-on-resolved: true
39+
40+
# Host disk space check: healthy while no writable filesystem is below 10% free.
# NOTE(review): the alert keys used here (failure-threshold, success-threshold,
# send-on-resolved) are Gatus', where `conditions` describe the HEALTHY state
# and the alert fires after `failure-threshold` consecutive failed checks.
# Confirm the consumer of this file follows the same semantics.
- name: "Low Disk Space Check"
  enabled: true
  group: "host"
  # PromQL (URL-encoded below):
  #   (node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"}
  #     / node_filesystem_size_bytes < 0.10
  #     and on (instance, device, mountpoint) node_filesystem_readonly == 0)
  # The 10% threshold is applied inside the query, so only filesystems that are
  # already low on space are returned.
  url: "http://prometheus.dms.dappnode:9090/api/v1/query?query=(node_filesystem_avail_bytes%7Bfstype!~%22^(fuse.*|tmpfs|cifs|nfs)%22%7D%20%2F%20node_filesystem_size_bytes%20%3C%200.10%20and%20on%20(instance%2C%20device%2C%20mountpoint)%20node_filesystem_readonly%20%3D%3D%200)"
  method: "GET"
  interval: "30s"  # Check every 30 seconds
  conditions:
    # Healthy: the filtered query returns no series. Indexing result[0] would
    # not resolve on an empty result and would only look at one filesystem.
    - "len([BODY].data.result) == 0"
  definition:
    title: "Host Out of Disk Space Alert"
    description: "Triggers if disk space falls below 10% for 2 minutes."
  alerts:
    - type: custom
      enabled: true
      description: "Disk space below 10% for 2 minutes"
      failure-threshold: 4  # 30s interval * 4 = 2 minutes
      success-threshold: 1
      send-on-resolved: true
58+
59+
# Disk fill prediction: healthy while no filesystem is projected to run out of
# space within the next 24 hours.
# NOTE(review): the alert keys used here (failure-threshold, success-threshold,
# send-on-resolved) are Gatus', where `conditions` describe the HEALTHY state
# and the alert fires after `failure-threshold` consecutive failed checks.
# Confirm the consumer of this file follows the same semantics.
- name: "Disk Filling Up Prediction"
  enabled: true
  group: "host"
  # PromQL (URL-encoded below):
  #   predict_linear(node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"}[1h], 86400) <= 0
  #     and node_filesystem_avail_bytes > 0
  # Extrapolates the last hour of free space 86400 seconds (24 hours) ahead and
  # returns only filesystems whose projection reaches zero.
  url: "http://prometheus.dms.dappnode:9090/api/v1/query?query=predict_linear(node_filesystem_avail_bytes%7Bfstype!~%22^(fuse.*|tmpfs|cifs|nfs)%22%7D%5B1h%5D%2C%2086400)%20%3C%3D%200%20and%20node_filesystem_avail_bytes%20%3E%200"
  method: "GET"
  interval: "30s"  # Check every 30 seconds
  conditions:
    # Healthy: the filtered query returns no series. Indexing result[0] would
    # not resolve on an empty result.
    - "len([BODY].data.result) == 0"
  definition:
    title: "Host Disk May Fill in 24 Hours Alert"
    description: "Triggers if the system predicts disk space will be exhausted within 24 hours."
  alerts:
    - type: custom
      enabled: true
      description: "Filesystem predicted to run out of space in 24 hours"
      failure-threshold: 4  # 30s interval * 4 = 2 minutes
      success-threshold: 1
      send-on-resolved: true
77+
78+
# Hardware temperature check: healthy while no sensor reads above its own
# reported maximum.
# NOTE(review): the alert keys used here (failure-threshold, success-threshold,
# send-on-resolved) are Gatus', where `conditions` describe the HEALTHY state
# and the alert fires after `failure-threshold` consecutive failed checks.
# Confirm the consumer of this file follows the same semantics.
- name: "Physical Component Overheating"
  enabled: true
  group: "host"
  # PromQL (URL-encoded below):
  #   node_hwmon_temp_celsius > node_hwmon_temp_max_celsius
  # The comparison filters the result, so only overheating sensors are returned.
  url: "http://prometheus.dms.dappnode:9090/api/v1/query?query=node_hwmon_temp_celsius%20%3E%20node_hwmon_temp_max_celsius"
  method: "GET"
  interval: "30s"  # Check every 30 seconds
  conditions:
    # Healthy: the filtered query returns no series. Indexing result[0] would
    # not resolve on an empty result.
    - "len([BODY].data.result) == 0"
  definition:
    title: "Host Physical Component Overheating Alert"
    description: "Triggers if any hardware component exceeds its maximum temperature threshold."
  alerts:
    - type: custom
      enabled: true
      description: "Hardware component temperature too high for 5 minutes"
      failure-threshold: 10  # 30s interval * 10 = 5 minutes
      success-threshold: 1
      send-on-resolved: true
96+
97+
# Temperature alarm check: healthy while no hwmon alarm flag is raised.
# NOTE(review): the alert keys used here (failure-threshold, success-threshold,
# send-on-resolved) are Gatus', where `conditions` describe the HEALTHY state
# and the alert fires after `failure-threshold` consecutive failed checks.
# Confirm the consumer of this file follows the same semantics.
- name: "Node Overtemperature Alarm"
  enabled: true
  group: "host"
  # PromQL (URL-encoded below):
  #   (node_hwmon_temp_crit_alarm_celsius == 1) or (node_hwmon_temp_alarm == 1)
  # Both comparisons filter the result, so only sensors with a raised alarm
  # are returned.
  url: "http://prometheus.dms.dappnode:9090/api/v1/query?query=%28node_hwmon_temp_crit_alarm_celsius%20%3D%3D%201%29%20or%20%28node_hwmon_temp_alarm%20%3D%3D%201%29"
  method: "GET"
  interval: "30s"  # Check every 30 seconds
  conditions:
    # Healthy: the filtered query returns no series. Indexing result[0] would
    # not resolve on an empty result.
    - "len([BODY].data.result) == 0"
  definition:
    title: "Host Node Overtemperature Alarm"
    description: "Triggers immediately if a critical temperature alarm is raised."
  alerts:
    - type: custom
      enabled: true
      description: "Node temperature alarm triggered"
      failure-threshold: 1  # Alert on the first failed check
      success-threshold: 1
      send-on-resolved: true

0 commit comments

Comments
 (0)