|
endpoints:
# Host CPU check.
# `conditions` describe the HEALTHY state of an endpoint; the alert fires after
# `failure-threshold` consecutive evaluations in which a condition is false.
# The condition below therefore states the healthy range (usage under 80%).
- name: "High CPU Usage Check"
  enabled: true
  group: "host"
  # PromQL (URL-encoded below):
  #   100 * (1 - avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[2m])))
  # Busy share is derived from the idle counter. Averaging the rates of all
  # non-idle modes would also average across modes and understate real usage.
  # NOTE(review): Gatus appears to coerce numeric operands to integers, which
  # would make a fractional 0.80 threshold compare as 0 - hence the 0-100
  # scale. Confirm against the deployed Gatus version.
  url: "http://prometheus.dms.dappnode:9090/api/v1/query?query=100%20*%20(1%20-%20avg%20by%20(instance)%20(rate(node_cpu_seconds_total%7Bmode%3D%22idle%22%7D%5B2m%5D)))"
  method: "GET"
  interval: "30s"
  conditions:
  # value[1] is the sample value of the first series in the query result.
  - "[BODY].data.result[0].value[1] < 80"
  definition:
    title: "High CPU Usage Alert"
    description: "Triggers if CPU usage exceeds 80% for 1 minute."
  alerts:
  - type: custom
    enabled: true
    description: "CPU usage above 80%"
    # 30s interval * 2 failed checks = 1 minute.
    failure-threshold: 2
    success-threshold: 1
    send-on-resolved: true
| 20 | + |
# Host memory check.
# `conditions` describe the HEALTHY state; the alert fires once the condition
# has been false for `failure-threshold` consecutive checks.
- name: "Low Available Memory Check"
  enabled: true
  group: "host"
  # PromQL (URL-encoded below):
  #   100 * node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes
  # Expressed as a percentage so the threshold is a whole number.
  # NOTE(review): Gatus appears to coerce numeric operands to integers, which
  # would make a fractional 0.10 threshold compare as 0 - confirm.
  url: "http://prometheus.dms.dappnode:9090/api/v1/query?query=100%20*%20node_memory_MemAvailable_bytes%20%2F%20node_memory_MemTotal_bytes"
  method: "GET"
  # Check every 30 seconds.
  interval: "30s"
  conditions:
  # Healthy while at least 10% of memory is available.
  - "[BODY].data.result[0].value[1] >= 10"
  definition:
    title: "Host Out of Memory Alert"
    description: "Triggers if available memory drops below 10% for 2 minutes."
  alerts:
  - type: custom
    enabled: true
    description: "Available memory below 10% for 2 minutes"
    # 30s interval * 4 failed checks = 2 minutes.
    failure-threshold: 4
    success-threshold: 1
    send-on-resolved: true
| 39 | + |
# Host disk-space check.
# `conditions` describe the HEALTHY state; the alert fires once the condition
# has been false for `failure-threshold` consecutive checks.
- name: "Low Disk Space Check"
  enabled: true
  group: "host"
  # PromQL (URL-encoded below):
  #   (node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"}
  #      / node_filesystem_size_bytes < 0.10
  #    and on (instance, device, mountpoint) node_filesystem_readonly == 0)
  # The `< 0.10` filter lives in the query, so the result only contains
  # writable filesystems that are already under 10% free.
  url: "http://prometheus.dms.dappnode:9090/api/v1/query?query=(node_filesystem_avail_bytes%7Bfstype%21~%22%5E(fuse.*%7Ctmpfs%7Ccifs%7Cnfs)%22%7D%20%2F%20node_filesystem_size_bytes%20%3C%200.10%20and%20on%20(instance%2C%20device%2C%20mountpoint)%20node_filesystem_readonly%20%3D%3D%200)"
  method: "GET"
  # Check every 30 seconds.
  interval: "30s"
  conditions:
  # Healthy when no filesystem matches the low-space query. Indexing
  # result[0] would fail on a healthy host because the result is empty.
  - "len([BODY].data.result) == 0"
  definition:
    title: "Host Out of Disk Space Alert"
    description: "Triggers if disk space falls below 10% for 2 minutes."
  alerts:
  - type: custom
    enabled: true
    description: "Disk space below 10% for 2 minutes"
    # 30s interval * 4 failed checks = 2 minutes.
    failure-threshold: 4
    success-threshold: 1
    send-on-resolved: true
| 58 | + |
# Disk fill-rate prediction.
# `conditions` describe the HEALTHY state; the alert fires once the condition
# has been false for `failure-threshold` consecutive checks.
- name: "Disk Filling Up Prediction"
  enabled: true
  group: "host"
  # PromQL (URL-encoded below):
  #   predict_linear(node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"}[1h], 86400) <= 0
  #     and node_filesystem_avail_bytes > 0
  # Extrapolates the last hour of free space 86400 s (24 h) ahead. Only
  # filesystems predicted to hit zero, and not already empty, are returned.
  url: "http://prometheus.dms.dappnode:9090/api/v1/query?query=predict_linear(node_filesystem_avail_bytes%7Bfstype%21~%22%5E(fuse.*%7Ctmpfs%7Ccifs%7Cnfs)%22%7D%5B1h%5D%2C%2086400)%20%3C%3D%200%20and%20node_filesystem_avail_bytes%20%3E%200"
  method: "GET"
  # Check every 30 seconds.
  interval: "30s"
  conditions:
  # Healthy when the query returns no filesystem predicted to fill up.
  - "len([BODY].data.result) == 0"
  definition:
    title: "Host Disk May Fill in 24 Hours Alert"
    description: "Triggers if the system predicts disk space will be exhausted within 24 hours."
  alerts:
  - type: custom
    enabled: true
    description: "Filesystem predicted to run out of space in 24 hours"
    # 30s interval * 4 failed checks = 2 minutes.
    failure-threshold: 4
    success-threshold: 1
    send-on-resolved: true
| 77 | + |
# Hardware temperature check.
# `conditions` describe the HEALTHY state; the alert fires once the condition
# has been false for `failure-threshold` consecutive checks.
- name: "Physical Component Overheating"
  enabled: true
  group: "host"
  # PromQL (URL-encoded below):
  #   node_hwmon_temp_celsius > node_hwmon_temp_max_celsius
  # A comparison between two vectors acts as a filter: only sensors reading
  # above their own maximum are returned.
  url: "http://prometheus.dms.dappnode:9090/api/v1/query?query=node_hwmon_temp_celsius%20%3E%20node_hwmon_temp_max_celsius"
  method: "GET"
  # Check every 30 seconds.
  interval: "30s"
  conditions:
  # Healthy when no sensor is above its maximum, i.e. the result is empty.
  - "len([BODY].data.result) == 0"
  definition:
    title: "Host Physical Component Overheating Alert"
    description: "Triggers if any hardware component exceeds its maximum temperature threshold."
  alerts:
  - type: custom
    enabled: true
    description: "Hardware component temperature too high for 5 minutes"
    # 30s interval * 10 failed checks = 5 minutes.
    failure-threshold: 10
    success-threshold: 1
    send-on-resolved: true
| 96 | + |
# Hardware over-temperature alarm flags.
# `conditions` describe the HEALTHY state; the alert fires once the condition
# has been false for `failure-threshold` consecutive checks.
- name: "Node Overtemperature Alarm"
  enabled: true
  group: "host"
  # PromQL (URL-encoded below):
  #   (node_hwmon_temp_crit_alarm_celsius == 1) or (node_hwmon_temp_alarm == 1)
  # Both sides are filters, so only sensors with a raised alarm flag are
  # returned.
  url: "http://prometheus.dms.dappnode:9090/api/v1/query?query=%28node_hwmon_temp_crit_alarm_celsius%20%3D%3D%201%29%20or%20%28node_hwmon_temp_alarm%20%3D%3D%201%29"
  method: "GET"
  # Check every 30 seconds.
  interval: "30s"
  conditions:
  # Healthy when no alarm flag is raised, i.e. the result is empty.
  - "len([BODY].data.result) == 0"
  definition:
    title: "Host Node Overtemperature Alarm"
    description: "Triggers immediately if a critical temperature alarm is raised."
  alerts:
  - type: custom
    enabled: true
    description: "Node temperature alarm triggered"
    # A single failed check is enough: alert immediately.
    failure-threshold: 1
    success-threshold: 1
    send-on-resolved: true
0 commit comments