From 291b1c1f8245b1a97422527cbb3a34255c57e1b5 Mon Sep 17 00:00:00 2001 From: Pablo Mendez Date: Tue, 11 Mar 2025 13:53:39 +0100 Subject: [PATCH 01/18] Add notifications --- notifications.yaml | 114 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 114 insertions(+) create mode 100644 notifications.yaml diff --git a/notifications.yaml b/notifications.yaml new file mode 100644 index 0000000..c014da1 --- /dev/null +++ b/notifications.yaml @@ -0,0 +1,114 @@ +endpoints: + - name: "High CPU Usage Check" + enabled: true + group: "host" + url: "http://prometheus.dms.dappnode:9090/api/v1/query?query=avg%20by%20(instance)%20(rate(node_cpu_seconds_total%7Bmode%21%3D%22idle%22%7D%5B2m%5D))" + method: "GET" + interval: "30s" + conditions: + - "[BODY].data.result[0].value[1] > 0.80" # Extract the second element of 'value' array + definition: + title: "High CPU Usage Alert" + description: "Triggers if CPU usage exceeds 80% for 1 minute." + alerts: + - type: custom + enabled: true + description: "CPU usage above 80%" + failure-threshold: 2 + success-threshold: 1 + send-on-resolved: true + + - name: "Low Available Memory Check" + enabled: true + group: "host" + url: "http://prometheus.dms.dappnode:9090/api/v1/query?query=node_memory_MemAvailable_bytes%20%2F%20node_memory_MemTotal_bytes" + method: "GET" + interval: "30s" # Check every 30 seconds + conditions: + - "[BODY].data.result[0].value[1] < 0.10" # Less than 10% memory available + definition: + title: "Host Out of Memory Alert" + description: "Triggers if available memory drops below 10% for 2 minutes." + alerts: + - type: custom + enabled: true + description: "Available memory below 10% for 2 minutes" + failure-threshold: 4 # (Every 30s * 4 = 2 minutes) + success-threshold: 1 + send-on-resolved: true + + - name: "Low Disk Space Check" + enabled: true + group: "host" + url: "http://prometheus.dms.dappnode:9090/api/v1/query?query=(node_filesystem_avail_bytes%7Bfstype!~%22^(fuse.*|tmpfs|cifs|nfs)%22%7D%20%2F%20node_filesystem_size_bytes%20%3C%200.10%20and%20on%20(instance%2C%20device%2C%20mountpoint)%20node_filesystem_readonly%20%3D%3D%200)" + method: "GET" + interval: "30s" # Check every 30 seconds + conditions: + - "[BODY].data.result[0].value[1] < 0.10" # Less than 10% disk space remaining + definition: + title: "Host Out of Disk Space Alert" + description: "Triggers if disk space falls below 10% for 2 minutes." + alerts: + - type: custom + enabled: true + description: "Disk space below 10% for 2 minutes" + failure-threshold: 4 # (30s check interval * 4 = 2 minutes) + success-threshold: 1 + send-on-resolved: true + + - name: "Disk Filling Up Prediction" + enabled: true + group: "host" + url: "http://prometheus.dms.dappnode:9090/api/v1/query?query=predict_linear(node_filesystem_avail_bytes%7Bfstype!~%22^(fuse.*|tmpfs|cifs|nfs)%22%7D%5B1h%5D%2C%2086400)%20%3C%3D%200%20and%20node_filesystem_avail_bytes%20%3E%200" + method: "GET" + interval: "30s" # Check every 30 seconds + conditions: + - "[BODY].data.result[0].value[1] <= 0" # Predicts disk will be full within 24 hours + definition: + title: "Host Disk May Fill in 24 Hours Alert" + description: "Triggers if the system predicts disk space will be exhausted within 24 hours." + alerts: + - type: custom + enabled: true + description: "Filesystem predicted to run out of space in 24 hours" + failure-threshold: 4 # (30s * 4 = 2 minutes) + success-threshold: 1 + send-on-resolved: true + + - name: "Physical Component Overheating" + enabled: true + group: "host" + url: "http://prometheus.dms.dappnode:9090/api/v1/query?query=node_hwmon_temp_celsius%20%3E%20node_hwmon_temp_max_celsius" + method: "GET" + interval: "30s" # Check every 30 seconds + conditions: + - "[BODY].data.result[0].value[1] > 0" # Ensures overheating condition is detected + definition: + title: "Host Physical Component Overheating Alert" + description: "Triggers if any hardware component exceeds its maximum temperature threshold." + alerts: + - type: custom + enabled: true + description: "Hardware component temperature too high for 5 minutes" + failure-threshold: 10 # (30s * 10 = 5 minutes) + success-threshold: 1 + send-on-resolved: true + + - name: "Node Overtemperature Alarm" + enabled: true + group: "host" + url: "http://prometheus.dms.dappnode:9090/api/v1/query?query=%28node_hwmon_temp_crit_alarm_celsius%20%3D%3D%201%29%20or%20%28node_hwmon_temp_alarm%20%3D%3D%201%29" + method: "GET" + interval: "30s" # Check every 30 seconds + conditions: + - "[BODY].data.result[0].value[1] == 1" # Overtemperature alarm triggered + definition: + title: "Host Node Overtemperature Alarm" + description: "Triggers immediately if a critical temperature alarm is raised." + alerts: + - type: custom + enabled: true + description: "Node temperature alarm triggered" + failure-threshold: 1 # Immediate alert + success-threshold: 1 + send-on-resolved: true From ace835d691390ae97838c36187f2f0e242b61798 Mon Sep 17 00:00:00 2001 From: Pablo Mendez Date: Mon, 17 Mar 2025 11:00:04 +0100 Subject: [PATCH 02/18] update notifications --- notifications.yaml | 113 ++++++++++----------------------------------- 1 file changed, 24 insertions(+), 89 deletions(-) diff --git a/notifications.yaml b/notifications.yaml index c014da1..b79dab3 100644 --- a/notifications.yaml +++ b/notifications.yaml @@ -2,113 +2,48 @@ endpoints: - name: "High CPU Usage Check" enabled: true group: "host" - url: "http://prometheus.dms.dappnode:9090/api/v1/query?query=avg%20by%20(instance)%20(rate(node_cpu_seconds_total%7Bmode%21%3D%22idle%22%7D%5B2m%5D))" + url: "http://prometheus.dms.dappnode:9090/api/v1/query?query=100*avg%20by%20(instance)%20(rate(node_cpu_seconds_total%7Bmode%21%3D%22idle%22%7D%5B2m%5D))" method: "GET" - interval: "30s" - conditions: - - "[BODY].data.result[0].value[1] > 0.80" # Extract the second element of 'value' array - definition: - title: "High CPU Usage Alert" - description: "Triggers if CPU usage exceeds 80% for 1 minute." - alerts: - - type: custom - enabled: true - description: "CPU usage above 80%" - failure-threshold: 2 - success-threshold: 1 - send-on-resolved: true - - - name: "Low Available Memory Check" - enabled: true - group: "host" - url: "http://prometheus.dms.dappnode:9090/api/v1/query?query=node_memory_MemAvailable_bytes%20%2F%20node_memory_MemTotal_bytes" - method: "GET" - interval: "30s" # Check every 30 seconds + interval: "30s" conditions: - - "[BODY].data.result[0].value[1] < 0.10" # Less than 10% memory available + - "[BODY].data.result[0].value[1] > 80" + metric: + min: 0; + max: 100; + unit: "%" definition: - title: "Host Out of Memory Alert" - description: "Triggers if available memory drops below 10% for 2 minutes." + title: "Configure your CPU Usage Alert" + description: "Triggers if CPU usage exceeds the limit defined in the condition" alerts: - type: custom enabled: true - description: "Available memory below 10% for 2 minutes" - failure-threshold: 4 # (Every 30s * 4 = 2 minutes) - success-threshold: 1 + description: "CPU % usage above [CONDITION_VALUE]" + failure-threshold: 2 + success-threshold: 1 send-on-resolved: true - - name: "Low Disk Space Check" + - name: "Host out of memory check" enabled: true group: "host" - url: "http://prometheus.dms.dappnode:9090/api/v1/query?query=(node_filesystem_avail_bytes%7Bfstype!~%22^(fuse.*|tmpfs|cifs|nfs)%22%7D%20%2F%20node_filesystem_size_bytes%20%3C%200.10%20and%20on%20(instance%2C%20device%2C%20mountpoint)%20node_filesystem_readonly%20%3D%3D%200)" + url: "http://prometheus.dms.dappnode:9090/api/v1/query?query=100*node_memory_MemAvailable_bytes%20%2F%20node_memory_MemTotal_bytes" method: "GET" - interval: "30s" # Check every 30 seconds + interval: "30s" conditions: - - "[BODY].data.result[0].value[1] < 0.10" # Less than 10% disk space remaining + - "[BODY].data.result[0].value[1] < 10" + metric: + min: 0; + max: 100; + unit: "%" definition: - title: "Host Out of Disk Space Alert" - description: "Triggers if disk space falls below 10% for 2 minutes." + title: "Configure your Memory Usage Alert" + description: "Triggers if available memory is below the limit defined in the condition" alerts: - type: custom enabled: true - description: "Disk space below 10% for 2 minutes" - failure-threshold: 4 # (30s check interval * 4 = 2 minutes) + description: "Available memory below [CONDITION_VALUE]" + failure-threshold: 2 success-threshold: 1 send-on-resolved: true - - name: "Disk Filling Up Prediction" - enabled: true - group: "host" - url: "http://prometheus.dms.dappnode:9090/api/v1/query?query=predict_linear(node_filesystem_avail_bytes%7Bfstype!~%22^(fuse.*|tmpfs|cifs|nfs)%22%7D%5B1h%5D%2C%2086400)%20%3C%3D%200%20and%20node_filesystem_avail_bytes%20%3E%200" - method: "GET" - interval: "30s" # Check every 30 seconds - conditions: - - "[BODY].data.result[0].value[1] <= 0" # Predicts disk will be full within 24 hours - definition: - title: "Host Disk May Fill in 24 Hours Alert" - description: "Triggers if the system predicts disk space will be exhausted within 24 hours." - alerts: - - type: custom - enabled: true - description: "Filesystem predicted to run out of space in 24 hours" - failure-threshold: 4 # (30s * 4 = 2 minutes) - success-threshold: 1 - send-on-resolved: true - - name: "Physical Component Overheating" - enabled: true - group: "host" - url: "http://prometheus.dms.dappnode:9090/api/v1/query?query=node_hwmon_temp_celsius%20%3E%20node_hwmon_temp_max_celsius" - method: "GET" - interval: "30s" # Check every 30 seconds - conditions: - - "[BODY].data.result[0].value[1] > 0" # Ensures overheating condition is detected - definition: - title: "Host Physical Component Overheating Alert" - description: "Triggers if any hardware component exceeds its maximum temperature threshold." - alerts: - - type: custom - enabled: true - description: "Hardware component temperature too high for 5 minutes" - failure-threshold: 10 # (30s * 10 = 5 minutes) - success-threshold: 1 - send-on-resolved: true - - name: "Node Overtemperature Alarm" - enabled: true - group: "host" - url: "http://prometheus.dms.dappnode:9090/api/v1/query?query=%28node_hwmon_temp_crit_alarm_celsius%20%3D%3D%201%29%20or%20%28node_hwmon_temp_alarm%20%3D%3D%201%29" - method: "GET" - interval: "30s" # Check every 30 seconds - conditions: - - "[BODY].data.result[0].value[1] == 1" # Overtemperature alarm triggered - definition: - title: "Host Node Overtemperature Alarm" - description: "Triggers immediately if a critical temperature alarm is raised." - alerts: - - type: custom - enabled: true - description: "Node temperature alarm triggered" - failure-threshold: 1 # Immediate alert - success-threshold: 1 - send-on-resolved: true From 6b55b0e4a191c742e64ff4078cf6a21d69878a6c Mon Sep 17 00:00:00 2001 From: Pablo Mendez Date: Mon, 17 Mar 2025 11:40:40 +0100 Subject: [PATCH 03/18] edit conditions --- notifications.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/notifications.yaml b/notifications.yaml index b79dab3..787f14c 100644 --- a/notifications.yaml +++ b/notifications.yaml @@ -6,7 +6,7 @@ endpoints: method: "GET" interval: "30s" conditions: - - "[BODY].data.result[0].value[1] > 80" + - "[BODY].data.result[0].value[1] < 80" metric: min: 0; max: 100; @@ -29,7 +29,7 @@ endpoints: method: "GET" interval: "30s" conditions: - - "[BODY].data.result[0].value[1] < 10" + - "[BODY].data.result[0].value[1] > 10" metric: min: 0; max: 100; From c9e5fba761781d80315714471eddb084ee8a2e0e Mon Sep 17 00:00:00 2001 From: Pablo Mendez Date: Mon, 17 Mar 2025 11:49:33 +0100 Subject: [PATCH 04/18] add temperature and host out of disk space metrics --- notifications.yaml | 45 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 44 insertions(+), 1 deletion(-) diff --git a/notifications.yaml b/notifications.yaml index 787f14c..f4e6aa8 100644 --- a/notifications.yaml +++ b/notifications.yaml @@ -45,5 +45,48 @@ endpoints: success-threshold: 1 send-on-resolved: true + - name: "Host Out of Disk Space Check" + enabled: true + group: "host" + url: "http://prometheus.dms.dappnode:9090/api/v1/query?query=avg((node_filesystem_avail_bytes%7Bfstype!~%22^(fuse.*|tmpfs|cifs|nfs)%22%7D%20%2F%20node_filesystem_size_bytes)%20*%20100)" + method: "GET" + interval: "30s" + conditions: + - "[BODY].data.result[0].value[1] > 10" # Alert if average available disk space < 10% + metric: + min: 0 + max: 100 + unit: "%" + definition: + title: "Configure your Disk Space Alert" + description: "Triggers if the available disk space across all instances is below 10%" + alerts: + - type: custom + enabled: true + description: "Average disk space is critically low at [CONDITION_VALUE]%" + failure-threshold: 2 + success-threshold: 1 + send-on-resolved: true - + - name: "Host Temperature Check" + enabled: true + group: "host" + url: "http://prometheus.dms.dappnode:9090/api/v1/query?query=avg(node_hwmon_temp_celsius)" + method: "GET" + interval: "30s" + conditions: + - "[BODY].data.result[0].value[1] < 85" + metric: + min: 25 + max: 100 + unit: "°C" + definition: + title: "Configure your Temperature Alert" + description: "Triggers if the average node temperature exceeds the defined threshold" + alerts: + - type: custom + enabled: true + description: "Average node temperature exceeded threshold: [CONDITION_VALUE]°C" + failure-threshold: 2 + success-threshold: 1 + send-on-resolved: true From 154092b0581e1651683b1f2c58f023739ee3a556 Mon Sep 17 00:00:00 2001 From: Pablo Mendez Date: Mon, 17 Mar 2025 14:16:16 +0100 Subject: [PATCH 05/18] fix typo --- notifications.yaml | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/notifications.yaml b/notifications.yaml index f4e6aa8..8e2343a 100644 --- a/notifications.yaml +++ b/notifications.yaml @@ -7,9 +7,9 @@ endpoints: interval: "30s" conditions: - "[BODY].data.result[0].value[1] < 80" - metric: - min: 0; - max: 100; + metric: + min: 0 + max: 100 unit: "%" definition: title: "Configure your CPU Usage Alert" @@ -27,12 +27,12 @@ endpoints: group: "host" url: "http://prometheus.dms.dappnode:9090/api/v1/query?query=100*node_memory_MemAvailable_bytes%20%2F%20node_memory_MemTotal_bytes" method: "GET" - interval: "30s" + interval: "30s" conditions: - - "[BODY].data.result[0].value[1] > 10" - metric: - min: 0; - max: 100; + - "[BODY].data.result[0].value[1] > 10" + metric: + min: 0 + max: 100 unit: "%" definition: title: "Configure your Memory Usage Alert" @@ -42,7 +42,7 @@ endpoints: enabled: true description: "Available memory below [CONDITION_VALUE]" failure-threshold: 2 - success-threshold: 1 + success-threshold: 1 send-on-resolved: true - name: "Host Out of Disk Space Check" @@ -52,8 +52,8 @@ endpoints: method: "GET" interval: "30s" conditions: - - "[BODY].data.result[0].value[1] > 10" # Alert if average available disk space < 10% - metric: + - "[BODY].data.result[0].value[1] > 10" # Alert if average available disk space < 10% + metric: min: 0 max: 100 unit: "%" @@ -75,8 +75,8 @@ endpoints: method: "GET" interval: "30s" conditions: - - "[BODY].data.result[0].value[1] < 85" - metric: + - "[BODY].data.result[0].value[1] < 85" + metric: min: 25 max: 100 unit: "°C" From 4cbe261dbfa55c448c686ae01d45ac53cb7d2316 Mon Sep 17 00:00:00 2001 From: Pablo Mendez Date: Tue, 18 Mar 2025 09:25:43 +0100 Subject: [PATCH 06/18] use average of cpu cores only --- notifications.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/notifications.yaml b/notifications.yaml index 8e2343a..2b8abfc 100644 --- a/notifications.yaml +++ b/notifications.yaml @@ -71,7 +71,7 @@ endpoints: - name: "Host Temperature Check" enabled: true group: "host" - url: "http://prometheus.dms.dappnode:9090/api/v1/query?query=avg(node_hwmon_temp_celsius)" + url: "http://prometheus.dms.dappnode:9090/api/v1/query?query=avg(node_hwmon_temp_celsius%7Bchip%3D~%22coretemp.*%7Ck10temp.*%7Cpci0000%3A.*%7Cacpitz.*%22%7D)" method: "GET" interval: "30s" conditions: From e87bb2ab659993bec96cb02c237f52711e0fa6a1 Mon Sep 17 00:00:00 2001 From: Pablo Mendez Date: Tue, 18 Mar 2025 09:26:51 +0100 Subject: [PATCH 07/18] update definition --- notifications.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/notifications.yaml b/notifications.yaml index 2b8abfc..481c434 100644 --- a/notifications.yaml +++ b/notifications.yaml @@ -59,7 +59,7 @@ endpoints: unit: "%" definition: title: "Configure your Disk Space Alert" - description: "Triggers if the available disk space across all instances is below 10%" + description: "Triggers if the available disk space across all instances is below the limit defined in the condition" alerts: - type: custom enabled: true From 4ee12d4b7a75fd53ecbc813e903fd6227a4fdbfe Mon Sep 17 00:00:00 2001 From: pablomendezroyo Date: Mon, 21 Apr 2025 13:01:13 +0200 Subject: [PATCH 08/18] add prio --- notifications.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/notifications.yaml b/notifications.yaml index 481c434..814f9ad 100644 --- a/notifications.yaml +++ b/notifications.yaml @@ -14,6 +14,7 @@ endpoints: definition: title: "Configure your CPU Usage Alert" description: "Triggers if CPU usage exceeds the limit defined in the condition" + priority: "medium" alerts: - type: custom enabled: true @@ -37,6 +38,7 @@ endpoints: definition: title: "Configure your Memory Usage Alert" description: "Triggers if available memory is below the limit defined in the condition" + priority: "medium" alerts: - type: custom enabled: true @@ -60,6 +62,7 @@ endpoints: definition: title: "Configure your Disk Space Alert" description: "Triggers if the available disk space across all instances is below the limit defined in the condition" + priority: "high" alerts: - type: custom enabled: true @@ -83,6 +86,7 @@ endpoints: definition: title: "Configure your Temperature Alert" description: "Triggers if the average node temperature exceeds the defined threshold" + priority: "medium" alerts: - type: custom enabled: true From f0693f541a597042742bedb60e0ef84e6adda52a Mon Sep 17 00:00:00 2001 From: Pablo Mendez Date: Wed, 21 May 2025 23:57:58 +0200 Subject: [PATCH 09/18] add correlationid and banner --- notifications.yaml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/notifications.yaml b/notifications.yaml index 814f9ad..b924d99 100644 --- a/notifications.yaml +++ b/notifications.yaml @@ -15,6 +15,8 @@ endpoints: title: "Configure your CPU Usage Alert" description: "Triggers if CPU usage exceeds the limit defined in the condition" priority: "medium" + correlationId: "dms-cpu" + isBanner: "false" alerts: - type: custom enabled: true @@ -39,6 +41,8 @@ endpoints: title: "Configure your Memory Usage Alert" description: "Triggers if available memory is below the limit defined in the condition" priority: "medium" + correlationId: "dms-memory" + isBanner: "false" alerts: - type: custom enabled: true @@ -63,6 +67,8 @@ endpoints: title: "Configure your Disk Space Alert" description: "Triggers if the available disk space across all instances is below the limit defined in the condition" priority: "high" + correlationId: "dms-disk" + isBanner: "false" alerts: - type: custom enabled: true @@ -87,6 +93,8 @@ endpoints: title: "Configure your Temperature Alert" description: "Triggers if the average node temperature exceeds the defined threshold" priority: "medium" + correlationId: "dms-temperature" + isBanner: "false" alerts: - type: custom enabled: true From 0427b6b21d0d32962aecca9373872e5fadcf6fa8 Mon Sep 17 00:00:00 2001 From: Pablo Mendez Date: Thu, 22 May 2025 16:03:29 +0200 Subject: [PATCH 10/18] set disk and temperature as banner --- notifications.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/notifications.yaml b/notifications.yaml index b924d99..32bb45e 100644 --- a/notifications.yaml +++ b/notifications.yaml @@ -68,7 +68,7 @@ endpoints: description: "Triggers if the available disk space across all instances is below the limit defined in the condition" priority: "high" correlationId: "dms-disk" - isBanner: "false" + isBanner: "true" alerts: - type: custom enabled: true @@ -94,7 +94,7 @@ endpoints: description: "Triggers if the average node temperature exceeds the defined threshold" priority: "medium" correlationId: "dms-temperature" - isBanner: "false" + isBanner: "true" alerts: - type: custom enabled: true From 66718cf4f5eeab73b9b3f102ddc97cdff663d0ee Mon Sep 17 00:00:00 2001 From: Pablo Mendez Date: Thu, 22 May 2025 16:41:24 +0200 Subject: [PATCH 11/18] consistency with evaluation --- notifications.yaml | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/notifications.yaml b/notifications.yaml index 32bb45e..c996840 100644 --- a/notifications.yaml +++ b/notifications.yaml @@ -2,11 +2,11 @@ endpoints: - name: "High CPU Usage Check" enabled: true group: "host" - url: "http://prometheus.dms.dappnode:9090/api/v1/query?query=100*avg%20by%20(instance)%20(rate(node_cpu_seconds_total%7Bmode%21%3D%22idle%22%7D%5B2m%5D))" + url: "http://prometheus.dms.dappnode:9090/api/v1/query?query=100*avg%20by%20(instance)%20(rate(node_cpu_seconds_total%7Bmode!%3D%22idle%22%7D%5B2m%5D))" method: "GET" interval: "30s" conditions: - - "[BODY].data.result[0].value[1] < 80" + - "[BODY].data.result[0].value[1] <= 80" metric: min: 0 max: 100 @@ -25,28 +25,30 @@ endpoints: success-threshold: 1 send-on-resolved: true - - name: "Host out of memory check" + - name: "Host Out of Memory Check" enabled: true group: "host" - url: "http://prometheus.dms.dappnode:9090/api/v1/query?query=100*node_memory_MemAvailable_bytes%20%2F%20node_memory_MemTotal_bytes" + url: >- + http://prometheus.dms.dappnode:9090/api/v1/query?query= + 100*(1 - node_memory_MemAvailable_bytes/node_memory_MemTotal_bytes) method: "GET" interval: "30s" conditions: - - "[BODY].data.result[0].value[1] > 10" + - "[BODY].data.result[0].value[1] <= 90" metric: min: 0 max: 100 unit: "%" definition: title: "Configure your Memory Usage Alert" - description: "Triggers if available memory is below the limit defined in the condition" + description: "Triggers if memory usage exceeds the limit defined in the condition" priority: "medium" correlationId: "dms-memory" isBanner: "false" alerts: - type: custom enabled: true - description: "Available memory below [CONDITION_VALUE]" + description: "Memory % usage above [CONDITION_VALUE]" failure-threshold: 2 success-threshold: 1 send-on-resolved: true @@ -54,25 +56,28 @@ endpoints: - name: "Host Out of Disk Space Check" enabled: true group: "host" - url: "http://prometheus.dms.dappnode:9090/api/v1/query?query=avg((node_filesystem_avail_bytes%7Bfstype!~%22^(fuse.*|tmpfs|cifs|nfs)%22%7D%20%2F%20node_filesystem_size_bytes)%20*%20100)" + url: >- + http://prometheus.dms.dappnode:9090/api/v1/query?query= + avg((1 - node_filesystem_avail_bytes%7Bfstype!~%22^(fuse.*|tmpfs|cifs|nfs)%22%7D/ + node_filesystem_size_bytes)*100) method: "GET" interval: "30s" conditions: - - "[BODY].data.result[0].value[1] > 10" # Alert if average available disk space < 10% + - "[BODY].data.result[0].value[1] <= 90" metric: min: 0 max: 100 unit: "%" definition: title: "Configure your Disk Space Alert" - description: "Triggers if the available disk space across all instances is below the limit defined in the condition" + description: "Triggers if disk usage exceeds the limit defined in the condition" priority: "high" correlationId: "dms-disk" isBanner: "true" alerts: - type: custom enabled: true - description: "Average disk space is critically low at [CONDITION_VALUE]%" + description: "Disk % usage above [CONDITION_VALUE]" failure-threshold: 2 success-threshold: 1 send-on-resolved: true @@ -84,7 +89,7 @@ endpoints: method: "GET" interval: "30s" conditions: - - "[BODY].data.result[0].value[1] < 85" + - "[BODY].data.result[0].value[1] <= 85" metric: min: 25 max: 100 @@ -98,7 +103,7 @@ endpoints: alerts: - type: custom enabled: true - description: "Average node temperature exceeded threshold: [CONDITION_VALUE]°C" + description: "Average node temperature above [CONDITION_VALUE]°C" failure-threshold: 2 success-threshold: 1 send-on-resolved: true From 2b7feec6269a209187839473dc2325906048d285 Mon Sep 17 00:00:00 2001 From: Pablo Mendez Date: Thu, 22 May 2025 17:02:42 +0200 Subject: [PATCH 12/18] use category hardware --- notifications.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/notifications.yaml b/notifications.yaml index c996840..a6d2d90 100644 --- a/notifications.yaml +++ b/notifications.yaml @@ -1,7 +1,7 @@ endpoints: - name: "High CPU Usage Check" enabled: true - group: "host" + group: "hardware" url: "http://prometheus.dms.dappnode:9090/api/v1/query?query=100*avg%20by%20(instance)%20(rate(node_cpu_seconds_total%7Bmode!%3D%22idle%22%7D%5B2m%5D))" method: "GET" interval: "30s" @@ -27,7 +27,7 @@ endpoints: - name: "Host Out of Memory Check" enabled: true - group: "host" + group: "hardware" url: >- http://prometheus.dms.dappnode:9090/api/v1/query?query= 100*(1 - node_memory_MemAvailable_bytes/node_memory_MemTotal_bytes) @@ -55,7 +55,7 @@ endpoints: - name: "Host Out of Disk Space Check" enabled: true - group: "host" + group: "hardware" url: >- http://prometheus.dms.dappnode:9090/api/v1/query?query= avg((1 - node_filesystem_avail_bytes%7Bfstype!~%22^(fuse.*|tmpfs|cifs|nfs)%22%7D/ @@ -84,7 +84,7 @@ endpoints: - name: "Host Temperature Check" enabled: true - group: "host" + group: "hardware" url: "http://prometheus.dms.dappnode:9090/api/v1/query?query=avg(node_hwmon_temp_celsius%7Bchip%3D~%22coretemp.*%7Ck10temp.*%7Cpci0000%3A.*%7Cacpitz.*%22%7D)" method: "GET" interval: "30s" From c17cb3b088c3110b75071fe4dee64f2fa60e6069 Mon Sep 17 00:00:00 2001 From: Pablo Mendez Date: Thu, 22 May 2025 18:15:28 +0200 Subject: [PATCH 13/18] use single line --- notifications.yaml | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/notifications.yaml b/notifications.yaml index a6d2d90..489225c 100644 --- a/notifications.yaml +++ b/notifications.yaml @@ -28,9 +28,7 @@ endpoints: - name: "Host Out of Memory Check" enabled: true group: "hardware" - url: >- - http://prometheus.dms.dappnode:9090/api/v1/query?query= - 100*(1 - node_memory_MemAvailable_bytes/node_memory_MemTotal_bytes) + url: "http://prometheus.dms.dappnode:9090/api/v1/query?query=100*(1 - node_memory_MemAvailable_bytes/node_memory_MemTotal_bytes)" method: "GET" interval: "30s" conditions: @@ -56,10 +54,7 @@ endpoints: - name: "Host Out of Disk Space Check" enabled: true group: "hardware" - url: >- - http://prometheus.dms.dappnode:9090/api/v1/query?query= - avg((1 - node_filesystem_avail_bytes%7Bfstype!~%22^(fuse.*|tmpfs|cifs|nfs)%22%7D/ - node_filesystem_size_bytes)*100) + url: "http://prometheus.dms.dappnode:9090/api/v1/query?query=avg((1 - node_filesystem_avail_bytes%7Bfstype!~%22^(fuse.*|tmpfs|cifs|nfs)%22%7D/node_filesystem_size_bytes)*100)"" method: "GET" interval: "30s" conditions: From 475316e7e79640dce13b07068986f030d5b4d62e Mon Sep 17 00:00:00 2001 From: Pablo Mendez Date: Thu, 22 May 2025 18:50:02 +0200 Subject: [PATCH 14/18] dont use white space --- notifications.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/notifications.yaml b/notifications.yaml index 489225c..de8e91b 100644 --- a/notifications.yaml +++ b/notifications.yaml @@ -28,7 +28,7 @@ endpoints: - name: "Host Out of Memory Check" enabled: true group: "hardware" - url: "http://prometheus.dms.dappnode:9090/api/v1/query?query=100*(1 - node_memory_MemAvailable_bytes/node_memory_MemTotal_bytes)" + url: "http://prometheus.dms.dappnode:9090/api/v1/query?query=100*(1-node_memory_MemAvailable_bytes/node_memory_MemTotal_bytes)" method: "GET" interval: "30s" conditions: @@ -54,7 +54,7 @@ endpoints: - name: "Host Out of Disk Space Check" enabled: true group: "hardware" - url: "http://prometheus.dms.dappnode:9090/api/v1/query?query=avg((1 - node_filesystem_avail_bytes%7Bfstype!~%22^(fuse.*|tmpfs|cifs|nfs)%22%7D/node_filesystem_size_bytes)*100)"" + url: "http://prometheus.dms.dappnode:9090/api/v1/query?query=avg((1-node_filesystem_avail_bytes%7Bfstype!~%22^(fuse.*|tmpfs|cifs|nfs)%22%7D/node_filesystem_size_bytes)*100)" method: "GET" interval: "30s" conditions: From f974c70560b173a3472f754a9fb4a843c6a5ad0d Mon Sep 17 00:00:00 2001 From: Pablo Mendez Date: Mon, 26 May 2025 10:16:32 +0200 Subject: [PATCH 15/18] update notifications yaml --- notifications.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/notifications.yaml b/notifications.yaml index de8e91b..4caaa9c 100644 --- a/notifications.yaml +++ b/notifications.yaml @@ -2,7 +2,7 @@ endpoints: - name: "High CPU Usage Check" enabled: true group: "hardware" - url: "http://prometheus.dms.dappnode:9090/api/v1/query?query=100*avg%20by%20(instance)%20(rate(node_cpu_seconds_total%7Bmode!%3D%22idle%22%7D%5B2m%5D))" + url: "http://prometheus.dms.dappnode:9090/api/v1/query?query=100%20*%20sum%20by%28instance%29%20%28rate%28node_cpu_seconds_total%7Bmode!%3D%22idle%22%7D%5B2m%5D%29%29%20%2F%20sum%20by%28instance%29%20%28rate%28node_cpu_seconds_total%5B2m%5D%29%29" method: "GET" interval: "30s" conditions: @@ -68,7 +68,7 @@ endpoints: description: "Triggers if disk usage exceeds the limit defined in the condition" priority: "high" correlationId: "dms-disk" - isBanner: "true" + isBanner: "false" alerts: - type: custom enabled: true @@ -94,7 +94,7 @@ endpoints: description: "Triggers if the average node temperature exceeds the defined threshold" priority: "medium" correlationId: "dms-temperature" - isBanner: "true" + isBanner: "false" alerts: - type: custom enabled: true From 0a527f17673bedc5e610b628d6e9be2359d729a3 Mon Sep 17 00:00:00 2001 From: mateumiralles Date: Wed, 28 May 2025 11:12:33 +0200 Subject: [PATCH 16/18] Update copies --- notifications.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/notifications.yaml b/notifications.yaml index 4caaa9c..dfea045 100644 --- a/notifications.yaml +++ b/notifications.yaml @@ -25,7 +25,7 @@ endpoints: success-threshold: 1 send-on-resolved: true - - name: "Host Out of Memory Check" + - name: "Host Memory Check" enabled: true group: "hardware" url: "http://prometheus.dms.dappnode:9090/api/v1/query?query=100*(1-node_memory_MemAvailable_bytes/node_memory_MemTotal_bytes)" @@ -51,7 +51,7 @@ endpoints: success-threshold: 1 send-on-resolved: true - - name: "Host Out of Disk Space Check" + - name: "Host Disk Space Check" enabled: true group: "hardware" url: "http://prometheus.dms.dappnode:9090/api/v1/query?query=avg((1-node_filesystem_avail_bytes%7Bfstype!~%22^(fuse.*|tmpfs|cifs|nfs)%22%7D/node_filesystem_size_bytes)*100)" From a7fbd0445cedd089eab87b362031bdff237133b3 Mon Sep 17 00:00:00 2001 From: Marc Font <36164126+Marketen@users.noreply.github.com> Date: Wed, 28 May 2025 12:02:03 +0200 Subject: [PATCH 17/18] improve cpu query (#84) --- notifications.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/notifications.yaml b/notifications.yaml index dfea045..96b52eb 100644 --- a/notifications.yaml +++ b/notifications.yaml @@ -2,7 +2,7 @@ endpoints: - name: "High CPU Usage Check" enabled: true group: "hardware" - url: "http://prometheus.dms.dappnode:9090/api/v1/query?query=100%20*%20sum%20by%28instance%29%20%28rate%28node_cpu_seconds_total%7Bmode!%3D%22idle%22%7D%5B2m%5D%29%29%20%2F%20sum%20by%28instance%29%20%28rate%28node_cpu_seconds_total%5B2m%5D%29%29" + url: "http://prometheus.dms.dappnode:9090/api/v1/query?query=avg%28node_hwmon_temp_celsius%7Bchip%3D~%22.*coretemp.*%7C.*18_3%24%7C.*k10temp.*%22%7D%29" method: "GET" interval: "30s" conditions: From 6ff102abcde9edf8d2ed791e317ad75740a13b6c Mon Sep 17 00:00:00 2001 From: Marketen Date: Wed, 28 May 2025 12:31:49 +0200 Subject: [PATCH 18/18] fix cpu usange and temp --- notifications.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/notifications.yaml b/notifications.yaml index 96b52eb..3a5e6f3 100644 --- a/notifications.yaml +++ b/notifications.yaml @@ -2,7 +2,7 @@ endpoints: - name: "High CPU Usage Check" enabled: true group: "hardware" - url: "http://prometheus.dms.dappnode:9090/api/v1/query?query=avg%28node_hwmon_temp_celsius%7Bchip%3D~%22.*coretemp.*%7C.*18_3%24%7C.*k10temp.*%22%7D%29" + url: "http://prometheus.dms.dappnode:9090/api/v1/query?query=100%20*%20sum%20by%28instance%29%20%28rate%28node_cpu_seconds_total%7Bmode!%3D%22idle%22%7D%5B2m%5D%29%29%20%2F%20sum%20by%28instance%29%20%28rate%28node_cpu_seconds_total%5B2m%5D%29%29" method: "GET" interval: "30s" conditions: @@ -80,7 +80,7 @@ endpoints: - name: "Host Temperature Check" enabled: true group: "hardware" - url: "http://prometheus.dms.dappnode:9090/api/v1/query?query=avg(node_hwmon_temp_celsius%7Bchip%3D~%22coretemp.*%7Ck10temp.*%7Cpci0000%3A.*%7Cacpitz.*%22%7D)" + url: "http://prometheus.dms.dappnode:9090/api/v1/query?query=avg%28node_hwmon_temp_celsius%7Bchip%3D~%22.*coretemp.*%7C.*18_3%24%7C.*k10temp.*%22%7D%29" method: "GET" interval: "30s" conditions: