@@ -2,113 +2,48 @@ endpoints:
22 - name : " High CPU Usage Check"
33 enabled : true
44 group : " host"
5- url : " http://prometheus.dms.dappnode:9090/api/v1/query?query=avg%20by%20(instance)%20(rate(node_cpu_seconds_total%7Bmode%21%3D%22idle%22%7D%5B2m%5D))"
5+ url : " http://prometheus.dms.dappnode:9090/api/v1/query?query=100*avg%20by%20(instance)%20(rate(node_cpu_seconds_total%7Bmode%21%3D%22idle%22%7D%5B2m%5D))"
66 method : " GET"
7- interval : " 30s"
8- conditions :
9- - " [BODY].data.result[0].value[1] > 0.80" # Extract the second element of 'value' array
10- definition :
11- title : " High CPU Usage Alert"
12- description : " Triggers if CPU usage exceeds 80% for 1 minute."
13- alerts :
14- - type : custom
15- enabled : true
16- description : " CPU usage above 80%"
17- failure-threshold : 2
18- success-threshold : 1
19- send-on-resolved : true
20-
21- - name : " Low Available Memory Check"
22- enabled : true
23- group : " host"
24- url : " http://prometheus.dms.dappnode:9090/api/v1/query?query=node_memory_MemAvailable_bytes%20%2F%20node_memory_MemTotal_bytes"
25- method : " GET"
26- interval : " 30s" # Check every 30 seconds
7+ interval : " 30s"
278 conditions :
28- - " [BODY].data.result[0].value[1] < 0.10" # Less than 10% memory available
9+ - " [BODY].data.result[0].value[1] > 80"
10+ metric :
11+ min : 0
12+ max : 100
13+ unit : " %"
2914 definition :
30- title : " Host Out of Memory Alert"
31- description : " Triggers if available memory drops below 10% for 2 minutes. "
15+ title : " Configure your CPU Usage Alert"
16+ description : " Triggers if CPU usage exceeds the limit defined in the condition "
3217 alerts :
3318 - type : custom
3419 enabled : true
35- description : " Available memory below 10% for 2 minutes "
36- failure-threshold : 4 # (Every 30s * 4 = 2 minutes)
37- success-threshold : 1
20+ description : " CPU % usage above [CONDITION_VALUE] "
21+ failure-threshold : 2
22+ success-threshold : 1
3823 send-on-resolved : true
3924
40- - name : " Low Disk Space Check "
25+ - name : " Host out of memory check "
4126 enabled : true
4227 group : " host"
43- url : " http://prometheus.dms.dappnode:9090/api/v1/query?query=(node_filesystem_avail_bytes%7Bfstype!~%22^(fuse.*|tmpfs|cifs|nfs)%22%7D%20%2F%20node_filesystem_size_bytes%20%3C%200.10%20and%20on%20(instance%2C%20device%2C%20mountpoint)%20node_filesystem_readonly%20%3D%3D%200) "
28+ url : " http://prometheus.dms.dappnode:9090/api/v1/query?query=100*node_memory_MemAvailable_bytes%20%2F%20node_memory_MemTotal_bytes "
4429 method : " GET"
45- interval : " 30s" # Check every 30 seconds
30+ interval : " 30s"
4631 conditions :
47- - " [BODY].data.result[0].value[1] < 0.10" # Less than 10% disk space remaining
32+ - " [BODY].data.result[0].value[1] < 10"
33+ metric :
34+ min : 0
35+ max : 100
36+ unit : " %"
4837 definition :
49- title : " Host Out of Disk Space Alert"
50- description : " Triggers if disk space falls below 10% for 2 minutes. "
38+ title : " Configure your Memory Usage Alert"
39+ description : " Triggers if available memory is below the limit defined in the condition "
5140 alerts :
5241 - type : custom
5342 enabled : true
54- description : " Disk space below 10% for 2 minutes "
55- failure-threshold : 4 # (30s check interval * 4 = 2 minutes)
43+ description : " Available memory below [CONDITION_VALUE] "
44+ failure-threshold : 2
5645 success-threshold : 1
5746 send-on-resolved : true
5847
59- - name : " Disk Filling Up Prediction"
60- enabled : true
61- group : " host"
62- url : " http://prometheus.dms.dappnode:9090/api/v1/query?query=predict_linear(node_filesystem_avail_bytes%7Bfstype!~%22^(fuse.*|tmpfs|cifs|nfs)%22%7D%5B1h%5D%2C%2086400)%20%3C%3D%200%20and%20node_filesystem_avail_bytes%20%3E%200"
63- method : " GET"
64- interval : " 30s" # Check every 30 seconds
65- conditions :
66- - " [BODY].data.result[0].value[1] <= 0" # Predicts disk will be full within 24 hours
67- definition :
68- title : " Host Disk May Fill in 24 Hours Alert"
69- description : " Triggers if the system predicts disk space will be exhausted within 24 hours."
70- alerts :
71- - type : custom
72- enabled : true
73- description : " Filesystem predicted to run out of space in 24 hours"
74- failure-threshold : 4 # (30s * 4 = 2 minutes)
75- success-threshold : 1
76- send-on-resolved : true
7748
78- - name : " Physical Component Overheating"
79- enabled : true
80- group : " host"
81- url : " http://prometheus.dms.dappnode:9090/api/v1/query?query=node_hwmon_temp_celsius%20%3E%20node_hwmon_temp_max_celsius"
82- method : " GET"
83- interval : " 30s" # Check every 30 seconds
84- conditions :
85- - " [BODY].data.result[0].value[1] > 0" # Ensures overheating condition is detected
86- definition :
87- title : " Host Physical Component Overheating Alert"
88- description : " Triggers if any hardware component exceeds its maximum temperature threshold."
89- alerts :
90- - type : custom
91- enabled : true
92- description : " Hardware component temperature too high for 5 minutes"
93- failure-threshold : 10 # (30s * 10 = 5 minutes)
94- success-threshold : 1
95- send-on-resolved : true
9649
97- - name : " Node Overtemperature Alarm"
98- enabled : true
99- group : " host"
100- url : " http://prometheus.dms.dappnode:9090/api/v1/query?query=%28node_hwmon_temp_crit_alarm_celsius%20%3D%3D%201%29%20or%20%28node_hwmon_temp_alarm%20%3D%3D%201%29"
101- method : " GET"
102- interval : " 30s" # Check every 30 seconds
103- conditions :
104- - " [BODY].data.result[0].value[1] == 1" # Overtemperature alarm triggered
105- definition :
106- title : " Host Node Overtemperature Alarm"
107- description : " Triggers immediately if a critical temperature alarm is raised."
108- alerts :
109- - type : custom
110- enabled : true
111- description : " Node temperature alarm triggered"
112- failure-threshold : 1 # Immediate alert
113- success-threshold : 1
114- send-on-resolved : true
0 commit comments