Skip to content

Commit 078f0e2

Browse files
author
Pablo Mendez
committed
update notifications
1 parent d1d6930 commit 078f0e2

File tree

1 file changed

+24
-89
lines changed

1 file changed

+24
-89
lines changed

notifications.yaml

Lines changed: 24 additions & 89 deletions
Original file line number | Diff line number | Diff line change
@@ -2,113 +2,48 @@ endpoints:
22
- name: "High CPU Usage Check"
33
enabled: true
44
group: "host"
5-
url: "http://prometheus.dms.dappnode:9090/api/v1/query?query=avg%20by%20(instance)%20(rate(node_cpu_seconds_total%7Bmode%21%3D%22idle%22%7D%5B2m%5D))"
5+
url: "http://prometheus.dms.dappnode:9090/api/v1/query?query=100*avg%20by%20(instance)%20(rate(node_cpu_seconds_total%7Bmode%21%3D%22idle%22%7D%5B2m%5D))"
66
method: "GET"
7-
interval: "30s"
8-
conditions:
9-
- "[BODY].data.result[0].value[1] > 0.80" # Extract the second element of 'value' array
10-
definition:
11-
title: "High CPU Usage Alert"
12-
description: "Triggers if CPU usage exceeds 80% for 1 minute."
13-
alerts:
14-
- type: custom
15-
enabled: true
16-
description: "CPU usage above 80%"
17-
failure-threshold: 2
18-
success-threshold: 1
19-
send-on-resolved: true
20-
21-
- name: "Low Available Memory Check"
22-
enabled: true
23-
group: "host"
24-
url: "http://prometheus.dms.dappnode:9090/api/v1/query?query=node_memory_MemAvailable_bytes%20%2F%20node_memory_MemTotal_bytes"
25-
method: "GET"
26-
interval: "30s" # Check every 30 seconds
7+
interval: "30s"
278
conditions:
28-
- "[BODY].data.result[0].value[1] < 0.10" # Less than 10% memory available
9+
- "[BODY].data.result[0].value[1] > 80"
10+
metric:
11+
min: 0  # plain integer; a trailing ";" makes YAML read the string "0;"
12+
max: 100  # plain integer; a trailing ";" makes YAML read the string "100;"
13+
unit: "%"
2914
definition:
30-
title: "Host Out of Memory Alert"
31-
description: "Triggers if available memory drops below 10% for 2 minutes."
15+
title: "Configure your CPU Usage Alert"
16+
description: "Triggers if CPU usage exceeds the limit defined in the condition"
3217
alerts:
3318
- type: custom
3419
enabled: true
35-
description: "Available memory below 10% for 2 minutes"
36-
failure-threshold: 4 # (Every 30s * 4 = 2 minutes)
37-
success-threshold: 1
20+
description: "CPU % usage above [CONDITION_VALUE]"
21+
failure-threshold: 2
22+
success-threshold: 1
3823
send-on-resolved: true
3924

40-
- name: "Low Disk Space Check"
25+
- name: "Host out of memory check"
4126
enabled: true
4227
group: "host"
43-
url: "http://prometheus.dms.dappnode:9090/api/v1/query?query=(node_filesystem_avail_bytes%7Bfstype!~%22^(fuse.*|tmpfs|cifs|nfs)%22%7D%20%2F%20node_filesystem_size_bytes%20%3C%200.10%20and%20on%20(instance%2C%20device%2C%20mountpoint)%20node_filesystem_readonly%20%3D%3D%200)"
28+
url: "http://prometheus.dms.dappnode:9090/api/v1/query?query=100*node_memory_MemAvailable_bytes%20%2F%20node_memory_MemTotal_bytes"
4429
method: "GET"
45-
interval: "30s" # Check every 30 seconds
30+
interval: "30s"
4631
conditions:
47-
- "[BODY].data.result[0].value[1] < 0.10" # Less than 10% disk space remaining
32+
- "[BODY].data.result[0].value[1] < 10"
33+
metric:
33+
min: 0  # plain integer; a trailing ";" makes YAML read the string "0;"
34+
max: 100  # plain integer; a trailing ";" makes YAML read the string "100;"
35+
unit: "%"
4837
definition:
49-
title: "Host Out of Disk Space Alert"
50-
description: "Triggers if disk space falls below 10% for 2 minutes."
38+
title: "Configure your Memory Usage Alert"
39+
description: "Triggers if available memory is below the limit defined in the condition"
5140
alerts:
5241
- type: custom
5342
enabled: true
54-
description: "Disk space below 10% for 2 minutes"
55-
failure-threshold: 4 # (30s check interval * 4 = 2 minutes)
43+
description: "Available memory below [CONDITION_VALUE]"
44+
failure-threshold: 2
5645
success-threshold: 1
5746
send-on-resolved: true
5847

59-
- name: "Disk Filling Up Prediction"
60-
enabled: true
61-
group: "host"
62-
url: "http://prometheus.dms.dappnode:9090/api/v1/query?query=predict_linear(node_filesystem_avail_bytes%7Bfstype!~%22^(fuse.*|tmpfs|cifs|nfs)%22%7D%5B1h%5D%2C%2086400)%20%3C%3D%200%20and%20node_filesystem_avail_bytes%20%3E%200"
63-
method: "GET"
64-
interval: "30s" # Check every 30 seconds
65-
conditions:
66-
- "[BODY].data.result[0].value[1] <= 0" # Predicts disk will be full within 24 hours
67-
definition:
68-
title: "Host Disk May Fill in 24 Hours Alert"
69-
description: "Triggers if the system predicts disk space will be exhausted within 24 hours."
70-
alerts:
71-
- type: custom
72-
enabled: true
73-
description: "Filesystem predicted to run out of space in 24 hours"
74-
failure-threshold: 4 # (30s * 4 = 2 minutes)
75-
success-threshold: 1
76-
send-on-resolved: true
7748

78-
- name: "Physical Component Overheating"
79-
enabled: true
80-
group: "host"
81-
url: "http://prometheus.dms.dappnode:9090/api/v1/query?query=node_hwmon_temp_celsius%20%3E%20node_hwmon_temp_max_celsius"
82-
method: "GET"
83-
interval: "30s" # Check every 30 seconds
84-
conditions:
85-
- "[BODY].data.result[0].value[1] > 0" # Ensures overheating condition is detected
86-
definition:
87-
title: "Host Physical Component Overheating Alert"
88-
description: "Triggers if any hardware component exceeds its maximum temperature threshold."
89-
alerts:
90-
- type: custom
91-
enabled: true
92-
description: "Hardware component temperature too high for 5 minutes"
93-
failure-threshold: 10 # (30s * 10 = 5 minutes)
94-
success-threshold: 1
95-
send-on-resolved: true
9649

97-
- name: "Node Overtemperature Alarm"
98-
enabled: true
99-
group: "host"
100-
url: "http://prometheus.dms.dappnode:9090/api/v1/query?query=%28node_hwmon_temp_crit_alarm_celsius%20%3D%3D%201%29%20or%20%28node_hwmon_temp_alarm%20%3D%3D%201%29"
101-
method: "GET"
102-
interval: "30s" # Check every 30 seconds
103-
conditions:
104-
- "[BODY].data.result[0].value[1] == 1" # Overtemperature alarm triggered
105-
definition:
106-
title: "Host Node Overtemperature Alarm"
107-
description: "Triggers immediately if a critical temperature alarm is raised."
108-
alerts:
109-
- type: custom
110-
enabled: true
111-
description: "Node temperature alarm triggered"
112-
failure-threshold: 1 # Immediate alert
113-
success-threshold: 1
114-
send-on-resolved: true

0 commit comments

Comments
 (0)