From 291b1c1f8245b1a97422527cbb3a34255c57e1b5 Mon Sep 17 00:00:00 2001
From: Pablo Mendez <pablo@dappnode.io>
Date: Tue, 11 Mar 2025 13:53:39 +0100
Subject: [PATCH 01/18] Add notifications

---
 notifications.yaml | 114 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 114 insertions(+)
 create mode 100644 notifications.yaml

diff --git a/notifications.yaml b/notifications.yaml
new file mode 100644
index 0000000..c014da1
--- /dev/null
+++ b/notifications.yaml
@@ -0,0 +1,114 @@
+endpoints:
+  - name: "High CPU Usage Check"
+    enabled: true
+    group: "host"
+    url: "http://prometheus.dms.dappnode:9090/api/v1/query?query=avg%20by%20(instance)%20(rate(node_cpu_seconds_total%7Bmode%21%3D%22idle%22%7D%5B2m%5D))"
+    method: "GET"
+    interval: "30s"  
+    conditions:
+      - "[BODY].data.result[0].value[1] > 0.80"  # Extract the second element of 'value' array
+    definition:
+      title: "High CPU Usage Alert"
+      description: "Triggers if CPU usage exceeds 80% for 1 minute."
+    alerts:
+      - type: custom
+        enabled: true
+        description: "CPU usage above 80%"
+        failure-threshold: 2  
+        success-threshold: 1  
+        send-on-resolved: true
+
+  - name: "Low Available Memory Check"
+    enabled: true
+    group: "host"
+    url: "http://prometheus.dms.dappnode:9090/api/v1/query?query=node_memory_MemAvailable_bytes%20%2F%20node_memory_MemTotal_bytes"
+    method: "GET"
+    interval: "30s"  # Check every 30 seconds
+    conditions:
+      - "[BODY].data.result[0].value[1] < 0.10"  # Less than 10% memory available
+    definition:
+      title: "Host Out of Memory Alert"
+      description: "Triggers if available memory drops below 10% for 2 minutes."
+    alerts:
+      - type: custom
+        enabled: true
+        description: "Available memory below 10% for 2 minutes"
+        failure-threshold: 4  # (Every 30s * 4 = 2 minutes)
+        success-threshold: 1  
+        send-on-resolved: true
+
+  - name: "Low Disk Space Check"
+    enabled: true
+    group: "host"
+    url: "http://prometheus.dms.dappnode:9090/api/v1/query?query=(node_filesystem_avail_bytes%7Bfstype!~%22^(fuse.*|tmpfs|cifs|nfs)%22%7D%20%2F%20node_filesystem_size_bytes%20%3C%200.10%20and%20on%20(instance%2C%20device%2C%20mountpoint)%20node_filesystem_readonly%20%3D%3D%200)"
+    method: "GET"
+    interval: "30s"  # Check every 30 seconds
+    conditions:
+      - "[BODY].data.result[0].value[1] < 0.10"  # Less than 10% disk space remaining
+    definition:
+      title: "Host Out of Disk Space Alert"
+      description: "Triggers if disk space falls below 10% for 2 minutes."
+    alerts:
+      - type: custom
+        enabled: true
+        description: "Disk space below 10% for 2 minutes"
+        failure-threshold: 4  # (30s check interval * 4 = 2 minutes)
+        success-threshold: 1  
+        send-on-resolved: true
+
+  - name: "Disk Filling Up Prediction"
+    enabled: true
+    group: "host"
+    url: "http://prometheus.dms.dappnode:9090/api/v1/query?query=predict_linear(node_filesystem_avail_bytes%7Bfstype!~%22^(fuse.*|tmpfs|cifs|nfs)%22%7D%5B1h%5D%2C%2086400)%20%3C%3D%200%20and%20node_filesystem_avail_bytes%20%3E%200"
+    method: "GET"
+    interval: "30s"  # Check every 30 seconds
+    conditions:
+      - "[BODY].data.result[0].value[1] <= 0"  # Predicts disk will be full within 24 hours
+    definition:
+      title: "Host Disk May Fill in 24 Hours Alert"
+      description: "Triggers if the system predicts disk space will be exhausted within 24 hours."
+    alerts:
+      - type: custom
+        enabled: true
+        description: "Filesystem predicted to run out of space in 24 hours"
+        failure-threshold: 4  # (30s * 4 = 2 minutes)
+        success-threshold: 1  
+        send-on-resolved: true
+
+  - name: "Physical Component Overheating"
+    enabled: true
+    group: "host"
+    url: "http://prometheus.dms.dappnode:9090/api/v1/query?query=node_hwmon_temp_celsius%20%3E%20node_hwmon_temp_max_celsius"
+    method: "GET"
+    interval: "30s"  # Check every 30 seconds
+    conditions:
+      - "[BODY].data.result[0].value[1] > 0"  # Ensures overheating condition is detected
+    definition:
+      title: "Host Physical Component Overheating Alert"
+      description: "Triggers if any hardware component exceeds its maximum temperature threshold."
+    alerts:
+      - type: custom
+        enabled: true
+        description: "Hardware component temperature too high for 5 minutes"
+        failure-threshold: 10  # (30s * 10 = 5 minutes)
+        success-threshold: 1  
+        send-on-resolved: true
+
+  - name: "Node Overtemperature Alarm"
+    enabled: true
+    group: "host"
+    url: "http://prometheus.dms.dappnode:9090/api/v1/query?query=%28node_hwmon_temp_crit_alarm_celsius%20%3D%3D%201%29%20or%20%28node_hwmon_temp_alarm%20%3D%3D%201%29"
+    method: "GET"
+    interval: "30s"  # Check every 30 seconds
+    conditions:
+      - "[BODY].data.result[0].value[1] == 1"  # Overtemperature alarm triggered
+    definition:
+      title: "Host Node Overtemperature Alarm"
+      description: "Triggers immediately if a critical temperature alarm is raised."
+    alerts:
+      - type: custom
+        enabled: true
+        description: "Node temperature alarm triggered"
+        failure-threshold: 1  # Immediate alert
+        success-threshold: 1  
+        send-on-resolved: true

From ace835d691390ae97838c36187f2f0e242b61798 Mon Sep 17 00:00:00 2001
From: Pablo Mendez <pablo@dappnode.io>
Date: Mon, 17 Mar 2025 11:00:04 +0100
Subject: [PATCH 02/18] update notifications

---
 notifications.yaml | 113 ++++++++++-----------------------------------
 1 file changed, 24 insertions(+), 89 deletions(-)

diff --git a/notifications.yaml b/notifications.yaml
index c014da1..b79dab3 100644
--- a/notifications.yaml
+++ b/notifications.yaml
@@ -2,113 +2,48 @@ endpoints:
   - name: "High CPU Usage Check"
     enabled: true
     group: "host"
-    url: "http://prometheus.dms.dappnode:9090/api/v1/query?query=avg%20by%20(instance)%20(rate(node_cpu_seconds_total%7Bmode%21%3D%22idle%22%7D%5B2m%5D))"
+    url: "http://prometheus.dms.dappnode:9090/api/v1/query?query=100*avg%20by%20(instance)%20(rate(node_cpu_seconds_total%7Bmode%21%3D%22idle%22%7D%5B2m%5D))"
     method: "GET"
-    interval: "30s"  
-    conditions:
-      - "[BODY].data.result[0].value[1] > 0.80"  # Extract the second element of 'value' array
-    definition:
-      title: "High CPU Usage Alert"
-      description: "Triggers if CPU usage exceeds 80% for 1 minute."
-    alerts:
-      - type: custom
-        enabled: true
-        description: "CPU usage above 80%"
-        failure-threshold: 2  
-        success-threshold: 1  
-        send-on-resolved: true
-
-  - name: "Low Available Memory Check"
-    enabled: true
-    group: "host"
-    url: "http://prometheus.dms.dappnode:9090/api/v1/query?query=node_memory_MemAvailable_bytes%20%2F%20node_memory_MemTotal_bytes"
-    method: "GET"
-    interval: "30s"  # Check every 30 seconds
+    interval: "30s"
     conditions:
-      - "[BODY].data.result[0].value[1] < 0.10"  # Less than 10% memory available
+      - "[BODY].data.result[0].value[1] > 80"
+    metric: 
+      min: 0;
+      max: 100;
+      unit: "%"
     definition:
-      title: "Host Out of Memory Alert"
-      description: "Triggers if available memory drops below 10% for 2 minutes."
+      title: "Configure your CPU Usage Alert"
+      description: "Triggers if CPU usage exceeds the limit defined in the condition"
     alerts:
       - type: custom
         enabled: true
-        description: "Available memory below 10% for 2 minutes"
-        failure-threshold: 4  # (Every 30s * 4 = 2 minutes)
-        success-threshold: 1  
+        description: "CPU % usage above [CONDITION_VALUE]"
+        failure-threshold: 2
+        success-threshold: 1
         send-on-resolved: true
 
-  - name: "Low Disk Space Check"
+  - name: "Host out of memory check"
     enabled: true
     group: "host"
-    url: "http://prometheus.dms.dappnode:9090/api/v1/query?query=(node_filesystem_avail_bytes%7Bfstype!~%22^(fuse.*|tmpfs|cifs|nfs)%22%7D%20%2F%20node_filesystem_size_bytes%20%3C%200.10%20and%20on%20(instance%2C%20device%2C%20mountpoint)%20node_filesystem_readonly%20%3D%3D%200)"
+    url: "http://prometheus.dms.dappnode:9090/api/v1/query?query=100*node_memory_MemAvailable_bytes%20%2F%20node_memory_MemTotal_bytes"
     method: "GET"
-    interval: "30s"  # Check every 30 seconds
+    interval: "30s"  
     conditions:
-      - "[BODY].data.result[0].value[1] < 0.10"  # Less than 10% disk space remaining
+      - "[BODY].data.result[0].value[1] < 10" 
+    metric: 
+      min: 0;
+      max: 100;
+      unit: "%"
     definition:
-      title: "Host Out of Disk Space Alert"
-      description: "Triggers if disk space falls below 10% for 2 minutes."
+      title: "Configure your Memory Usage Alert"
+      description: "Triggers if available memory is below the limit defined in the condition"
     alerts:
       - type: custom
         enabled: true
-        description: "Disk space below 10% for 2 minutes"
-        failure-threshold: 4  # (30s check interval * 4 = 2 minutes)
+        description: "Available memory below [CONDITION_VALUE]"
+        failure-threshold: 2
         success-threshold: 1  
         send-on-resolved: true
 
-  - name: "Disk Filling Up Prediction"
-    enabled: true
-    group: "host"
-    url: "http://prometheus.dms.dappnode:9090/api/v1/query?query=predict_linear(node_filesystem_avail_bytes%7Bfstype!~%22^(fuse.*|tmpfs|cifs|nfs)%22%7D%5B1h%5D%2C%2086400)%20%3C%3D%200%20and%20node_filesystem_avail_bytes%20%3E%200"
-    method: "GET"
-    interval: "30s"  # Check every 30 seconds
-    conditions:
-      - "[BODY].data.result[0].value[1] <= 0"  # Predicts disk will be full within 24 hours
-    definition:
-      title: "Host Disk May Fill in 24 Hours Alert"
-      description: "Triggers if the system predicts disk space will be exhausted within 24 hours."
-    alerts:
-      - type: custom
-        enabled: true
-        description: "Filesystem predicted to run out of space in 24 hours"
-        failure-threshold: 4  # (30s * 4 = 2 minutes)
-        success-threshold: 1  
-        send-on-resolved: true
 
-  - name: "Physical Component Overheating"
-    enabled: true
-    group: "host"
-    url: "http://prometheus.dms.dappnode:9090/api/v1/query?query=node_hwmon_temp_celsius%20%3E%20node_hwmon_temp_max_celsius"
-    method: "GET"
-    interval: "30s"  # Check every 30 seconds
-    conditions:
-      - "[BODY].data.result[0].value[1] > 0"  # Ensures overheating condition is detected
-    definition:
-      title: "Host Physical Component Overheating Alert"
-      description: "Triggers if any hardware component exceeds its maximum temperature threshold."
-    alerts:
-      - type: custom
-        enabled: true
-        description: "Hardware component temperature too high for 5 minutes"
-        failure-threshold: 10  # (30s * 10 = 5 minutes)
-        success-threshold: 1  
-        send-on-resolved: true
 
-  - name: "Node Overtemperature Alarm"
-    enabled: true
-    group: "host"
-    url: "http://prometheus.dms.dappnode:9090/api/v1/query?query=%28node_hwmon_temp_crit_alarm_celsius%20%3D%3D%201%29%20or%20%28node_hwmon_temp_alarm%20%3D%3D%201%29"
-    method: "GET"
-    interval: "30s"  # Check every 30 seconds
-    conditions:
-      - "[BODY].data.result[0].value[1] == 1"  # Overtemperature alarm triggered
-    definition:
-      title: "Host Node Overtemperature Alarm"
-      description: "Triggers immediately if a critical temperature alarm is raised."
-    alerts:
-      - type: custom
-        enabled: true
-        description: "Node temperature alarm triggered"
-        failure-threshold: 1  # Immediate alert
-        success-threshold: 1  
-        send-on-resolved: true

From 6b55b0e4a191c742e64ff4078cf6a21d69878a6c Mon Sep 17 00:00:00 2001
From: Pablo Mendez <pablo@dappnode.io>
Date: Mon, 17 Mar 2025 11:40:40 +0100
Subject: [PATCH 03/18] edit conditions

---
 notifications.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/notifications.yaml b/notifications.yaml
index b79dab3..787f14c 100644
--- a/notifications.yaml
+++ b/notifications.yaml
@@ -6,7 +6,7 @@ endpoints:
     method: "GET"
     interval: "30s"
     conditions:
-      - "[BODY].data.result[0].value[1] > 80"
+      - "[BODY].data.result[0].value[1] < 80"
     metric: 
       min: 0;
       max: 100;
@@ -29,7 +29,7 @@ endpoints:
     method: "GET"
     interval: "30s"  
     conditions:
-      - "[BODY].data.result[0].value[1] < 10" 
+      - "[BODY].data.result[0].value[1] > 10" 
     metric: 
       min: 0;
       max: 100;

From c9e5fba761781d80315714471eddb084ee8a2e0e Mon Sep 17 00:00:00 2001
From: Pablo Mendez <pablo@dappnode.io>
Date: Mon, 17 Mar 2025 11:49:33 +0100
Subject: [PATCH 04/18] add temperature and host out of disk space metrics

---
 notifications.yaml | 45 ++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 44 insertions(+), 1 deletion(-)

diff --git a/notifications.yaml b/notifications.yaml
index 787f14c..f4e6aa8 100644
--- a/notifications.yaml
+++ b/notifications.yaml
@@ -45,5 +45,48 @@ endpoints:
         success-threshold: 1  
         send-on-resolved: true
 
+  - name: "Host Out of Disk Space Check"
+    enabled: true
+    group: "host"
+    url: "http://prometheus.dms.dappnode:9090/api/v1/query?query=avg((node_filesystem_avail_bytes%7Bfstype!~%22^(fuse.*|tmpfs|cifs|nfs)%22%7D%20%2F%20node_filesystem_size_bytes)%20*%20100)"
+    method: "GET"
+    interval: "30s"
+    conditions:
+      - "[BODY].data.result[0].value[1] > 10"  # Alert if average available disk space < 10%
+    metric: 
+      min: 0
+      max: 100
+      unit: "%"
+    definition:
+      title: "Configure your Disk Space Alert"
+      description: "Triggers if the available disk space across all instances is below 10%"
+    alerts:
+      - type: custom
+        enabled: true
+        description: "Average disk space is critically low at [CONDITION_VALUE]%"
+        failure-threshold: 2
+        success-threshold: 1
+        send-on-resolved: true
 
-
+  - name: "Host Temperature Check"
+    enabled: true
+    group: "host"
+    url: "http://prometheus.dms.dappnode:9090/api/v1/query?query=avg(node_hwmon_temp_celsius)"
+    method: "GET"
+    interval: "30s"
+    conditions:
+      - "[BODY].data.result[0].value[1] < 85"  
+    metric: 
+      min: 25
+      max: 100
+      unit: "°C"
+    definition:
+      title: "Configure your Temperature Alert"
+      description: "Triggers if the average node temperature exceeds the defined threshold"
+    alerts:
+      - type: custom
+        enabled: true
+        description: "Average node temperature exceeded threshold: [CONDITION_VALUE]°C"
+        failure-threshold: 2
+        success-threshold: 1
+        send-on-resolved: true

From 154092b0581e1651683b1f2c58f023739ee3a556 Mon Sep 17 00:00:00 2001
From: Pablo Mendez <pablo@dappnode.io>
Date: Mon, 17 Mar 2025 14:16:16 +0100
Subject: [PATCH 05/18] remove stray semicolons and trailing whitespace

---
 notifications.yaml | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/notifications.yaml b/notifications.yaml
index f4e6aa8..8e2343a 100644
--- a/notifications.yaml
+++ b/notifications.yaml
@@ -7,9 +7,9 @@ endpoints:
     interval: "30s"
     conditions:
       - "[BODY].data.result[0].value[1] < 80"
-    metric: 
-      min: 0;
-      max: 100;
+    metric:
+      min: 0
+      max: 100
       unit: "%"
     definition:
       title: "Configure your CPU Usage Alert"
@@ -27,12 +27,12 @@ endpoints:
     group: "host"
     url: "http://prometheus.dms.dappnode:9090/api/v1/query?query=100*node_memory_MemAvailable_bytes%20%2F%20node_memory_MemTotal_bytes"
     method: "GET"
-    interval: "30s"  
+    interval: "30s"
     conditions:
-      - "[BODY].data.result[0].value[1] > 10" 
-    metric: 
-      min: 0;
-      max: 100;
+      - "[BODY].data.result[0].value[1] > 10"
+    metric:
+      min: 0
+      max: 100
       unit: "%"
     definition:
       title: "Configure your Memory Usage Alert"
@@ -42,7 +42,7 @@ endpoints:
         enabled: true
         description: "Available memory below [CONDITION_VALUE]"
         failure-threshold: 2
-        success-threshold: 1  
+        success-threshold: 1
         send-on-resolved: true
 
   - name: "Host Out of Disk Space Check"
@@ -52,8 +52,8 @@ endpoints:
     method: "GET"
     interval: "30s"
     conditions:
-      - "[BODY].data.result[0].value[1] > 10"  # Alert if average available disk space < 10%
-    metric: 
+      - "[BODY].data.result[0].value[1] > 10" # Alert if average available disk space < 10%
+    metric:
       min: 0
       max: 100
       unit: "%"
@@ -75,8 +75,8 @@ endpoints:
     method: "GET"
     interval: "30s"
     conditions:
-      - "[BODY].data.result[0].value[1] < 85"  
-    metric: 
+      - "[BODY].data.result[0].value[1] < 85"
+    metric:
       min: 25
       max: 100
       unit: "°C"

From 4cbe261dbfa55c448c686ae01d45ac53cb7d2316 Mon Sep 17 00:00:00 2001
From: Pablo Mendez <pablo@dappnode.io>
Date: Tue, 18 Mar 2025 09:25:43 +0100
Subject: [PATCH 06/18] use average of cpu cores only

---
 notifications.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/notifications.yaml b/notifications.yaml
index 8e2343a..2b8abfc 100644
--- a/notifications.yaml
+++ b/notifications.yaml
@@ -71,7 +71,7 @@ endpoints:
   - name: "Host Temperature Check"
     enabled: true
     group: "host"
-    url: "http://prometheus.dms.dappnode:9090/api/v1/query?query=avg(node_hwmon_temp_celsius)"
+    url: "http://prometheus.dms.dappnode:9090/api/v1/query?query=avg(node_hwmon_temp_celsius%7Bchip%3D~%22coretemp.*%7Ck10temp.*%7Cpci0000%3A.*%7Cacpitz.*%22%7D)"
     method: "GET"
     interval: "30s"
     conditions:

From e87bb2ab659993bec96cb02c237f52711e0fa6a1 Mon Sep 17 00:00:00 2001
From: Pablo Mendez <pablo@dappnode.io>
Date: Tue, 18 Mar 2025 09:26:51 +0100
Subject: [PATCH 07/18] update definition

---
 notifications.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/notifications.yaml b/notifications.yaml
index 2b8abfc..481c434 100644
--- a/notifications.yaml
+++ b/notifications.yaml
@@ -59,7 +59,7 @@ endpoints:
       unit: "%"
     definition:
       title: "Configure your Disk Space Alert"
-      description: "Triggers if the available disk space across all instances is below 10%"
+      description: "Triggers if the available disk space across all instances is below the limit defined in the condition"
     alerts:
       - type: custom
         enabled: true

From 4ee12d4b7a75fd53ecbc813e903fd6227a4fdbfe Mon Sep 17 00:00:00 2001
From: pablomendezroyo <mendez4a@gmail.com>
Date: Mon, 21 Apr 2025 13:01:13 +0200
Subject: [PATCH 08/18] add priority

---
 notifications.yaml | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/notifications.yaml b/notifications.yaml
index 481c434..814f9ad 100644
--- a/notifications.yaml
+++ b/notifications.yaml
@@ -14,6 +14,7 @@ endpoints:
     definition:
       title: "Configure your CPU Usage Alert"
       description: "Triggers if CPU usage exceeds the limit defined in the condition"
+    priority: "medium"
     alerts:
       - type: custom
         enabled: true
@@ -37,6 +38,7 @@ endpoints:
     definition:
       title: "Configure your Memory Usage Alert"
       description: "Triggers if available memory is below the limit defined in the condition"
+    priority: "medium"
     alerts:
       - type: custom
         enabled: true
@@ -60,6 +62,7 @@ endpoints:
     definition:
       title: "Configure your Disk Space Alert"
       description: "Triggers if the available disk space across all instances is below the limit defined in the condition"
+    priority: "high"
     alerts:
       - type: custom
         enabled: true
@@ -83,6 +86,7 @@ endpoints:
     definition:
       title: "Configure your Temperature Alert"
       description: "Triggers if the average node temperature exceeds the defined threshold"
+    priority: "medium"
     alerts:
       - type: custom
         enabled: true

From f0693f541a597042742bedb60e0ef84e6adda52a Mon Sep 17 00:00:00 2001
From: Pablo Mendez <pablo@dappnode.io>
Date: Wed, 21 May 2025 23:57:58 +0200
Subject: [PATCH 09/18] add correlationid and banner

---
 notifications.yaml | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/notifications.yaml b/notifications.yaml
index 814f9ad..b924d99 100644
--- a/notifications.yaml
+++ b/notifications.yaml
@@ -15,6 +15,8 @@ endpoints:
       title: "Configure your CPU Usage Alert"
       description: "Triggers if CPU usage exceeds the limit defined in the condition"
     priority: "medium"
+    correlationId: "dms-cpu"
+    isBanner: "false"
     alerts:
       - type: custom
         enabled: true
@@ -39,6 +41,8 @@ endpoints:
       title: "Configure your Memory Usage Alert"
       description: "Triggers if available memory is below the limit defined in the condition"
     priority: "medium"
+    correlationId: "dms-memory"
+    isBanner: "false"
     alerts:
       - type: custom
         enabled: true
@@ -63,6 +67,8 @@ endpoints:
       title: "Configure your Disk Space Alert"
       description: "Triggers if the available disk space across all instances is below the limit defined in the condition"
     priority: "high"
+    correlationId: "dms-disk"
+    isBanner: "false"
     alerts:
       - type: custom
         enabled: true
@@ -87,6 +93,8 @@ endpoints:
       title: "Configure your Temperature Alert"
       description: "Triggers if the average node temperature exceeds the defined threshold"
     priority: "medium"
+    correlationId: "dms-temperature"
+    isBanner: "false"
     alerts:
       - type: custom
         enabled: true

From 0427b6b21d0d32962aecca9373872e5fadcf6fa8 Mon Sep 17 00:00:00 2001
From: Pablo Mendez <pablo@dappnode.io>
Date: Thu, 22 May 2025 16:03:29 +0200
Subject: [PATCH 10/18] set disk and temperature as banner

---
 notifications.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/notifications.yaml b/notifications.yaml
index b924d99..32bb45e 100644
--- a/notifications.yaml
+++ b/notifications.yaml
@@ -68,7 +68,7 @@ endpoints:
       description: "Triggers if the available disk space across all instances is below the limit defined in the condition"
     priority: "high"
     correlationId: "dms-disk"
-    isBanner: "false"
+    isBanner: "true"
     alerts:
       - type: custom
         enabled: true
@@ -94,7 +94,7 @@ endpoints:
       description: "Triggers if the average node temperature exceeds the defined threshold"
     priority: "medium"
     correlationId: "dms-temperature"
-    isBanner: "false"
+    isBanner: "true"
     alerts:
       - type: custom
         enabled: true

From 66718cf4f5eeab73b9b3f102ddc97cdff663d0ee Mon Sep 17 00:00:00 2001
From: Pablo Mendez <pablo@dappnode.io>
Date: Thu, 22 May 2025 16:41:24 +0200
Subject: [PATCH 11/18] consistency with evaluation

---
 notifications.yaml | 31 ++++++++++++++++++-------------
 1 file changed, 18 insertions(+), 13 deletions(-)

diff --git a/notifications.yaml b/notifications.yaml
index 32bb45e..c996840 100644
--- a/notifications.yaml
+++ b/notifications.yaml
@@ -2,11 +2,11 @@ endpoints:
   - name: "High CPU Usage Check"
     enabled: true
     group: "host"
-    url: "http://prometheus.dms.dappnode:9090/api/v1/query?query=100*avg%20by%20(instance)%20(rate(node_cpu_seconds_total%7Bmode%21%3D%22idle%22%7D%5B2m%5D))"
+    url: "http://prometheus.dms.dappnode:9090/api/v1/query?query=100*avg%20by%20(instance)%20(rate(node_cpu_seconds_total%7Bmode!%3D%22idle%22%7D%5B2m%5D))"
     method: "GET"
     interval: "30s"
     conditions:
-      - "[BODY].data.result[0].value[1] < 80"
+      - "[BODY].data.result[0].value[1] <= 80"
     metric:
       min: 0
       max: 100
@@ -25,28 +25,30 @@ endpoints:
         success-threshold: 1
         send-on-resolved: true
 
-  - name: "Host out of memory check"
+  - name: "Host Out of Memory Check"
     enabled: true
     group: "host"
-    url: "http://prometheus.dms.dappnode:9090/api/v1/query?query=100*node_memory_MemAvailable_bytes%20%2F%20node_memory_MemTotal_bytes"
+    url: >-
+      http://prometheus.dms.dappnode:9090/api/v1/query?query=
+      100*(1 - node_memory_MemAvailable_bytes/node_memory_MemTotal_bytes)
     method: "GET"
     interval: "30s"
     conditions:
-      - "[BODY].data.result[0].value[1] > 10"
+      - "[BODY].data.result[0].value[1] <= 90"
     metric:
       min: 0
       max: 100
       unit: "%"
     definition:
       title: "Configure your Memory Usage Alert"
-      description: "Triggers if available memory is below the limit defined in the condition"
+      description: "Triggers if memory usage exceeds the limit defined in the condition"
     priority: "medium"
     correlationId: "dms-memory"
     isBanner: "false"
     alerts:
       - type: custom
         enabled: true
-        description: "Available memory below [CONDITION_VALUE]"
+        description: "Memory % usage above [CONDITION_VALUE]"
         failure-threshold: 2
         success-threshold: 1
         send-on-resolved: true
@@ -54,25 +56,28 @@ endpoints:
   - name: "Host Out of Disk Space Check"
     enabled: true
     group: "host"
-    url: "http://prometheus.dms.dappnode:9090/api/v1/query?query=avg((node_filesystem_avail_bytes%7Bfstype!~%22^(fuse.*|tmpfs|cifs|nfs)%22%7D%20%2F%20node_filesystem_size_bytes)%20*%20100)"
+    url: >-
+      http://prometheus.dms.dappnode:9090/api/v1/query?query=
+      avg((1 - node_filesystem_avail_bytes%7Bfstype!~%22^(fuse.*|tmpfs|cifs|nfs)%22%7D/
+      node_filesystem_size_bytes)*100)
     method: "GET"
     interval: "30s"
     conditions:
-      - "[BODY].data.result[0].value[1] > 10" # Alert if average available disk space < 10%
+      - "[BODY].data.result[0].value[1] <= 90"
     metric:
       min: 0
       max: 100
       unit: "%"
     definition:
       title: "Configure your Disk Space Alert"
-      description: "Triggers if the available disk space across all instances is below the limit defined in the condition"
+      description: "Triggers if disk usage exceeds the limit defined in the condition"
     priority: "high"
     correlationId: "dms-disk"
     isBanner: "true"
     alerts:
       - type: custom
         enabled: true
-        description: "Average disk space is critically low at [CONDITION_VALUE]%"
+        description: "Disk % usage above [CONDITION_VALUE]"
         failure-threshold: 2
         success-threshold: 1
         send-on-resolved: true
@@ -84,7 +89,7 @@ endpoints:
     method: "GET"
     interval: "30s"
     conditions:
-      - "[BODY].data.result[0].value[1] < 85"
+      - "[BODY].data.result[0].value[1] <= 85"
     metric:
       min: 25
       max: 100
@@ -98,7 +103,7 @@ endpoints:
     alerts:
       - type: custom
         enabled: true
-        description: "Average node temperature exceeded threshold: [CONDITION_VALUE]°C"
+        description: "Average node temperature above [CONDITION_VALUE]°C"
         failure-threshold: 2
         success-threshold: 1
         send-on-resolved: true

From 2b7feec6269a209187839473dc2325906048d285 Mon Sep 17 00:00:00 2001
From: Pablo Mendez <pablo@dappnode.io>
Date: Thu, 22 May 2025 17:02:42 +0200
Subject: [PATCH 12/18] use category hardware

---
 notifications.yaml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/notifications.yaml b/notifications.yaml
index c996840..a6d2d90 100644
--- a/notifications.yaml
+++ b/notifications.yaml
@@ -1,7 +1,7 @@
 endpoints:
   - name: "High CPU Usage Check"
     enabled: true
-    group: "host"
+    group: "hardware"
     url: "http://prometheus.dms.dappnode:9090/api/v1/query?query=100*avg%20by%20(instance)%20(rate(node_cpu_seconds_total%7Bmode!%3D%22idle%22%7D%5B2m%5D))"
     method: "GET"
     interval: "30s"
@@ -27,7 +27,7 @@ endpoints:
 
   - name: "Host Out of Memory Check"
     enabled: true
-    group: "host"
+    group: "hardware"
     url: >-
       http://prometheus.dms.dappnode:9090/api/v1/query?query=
       100*(1 - node_memory_MemAvailable_bytes/node_memory_MemTotal_bytes)
@@ -55,7 +55,7 @@ endpoints:
 
   - name: "Host Out of Disk Space Check"
     enabled: true
-    group: "host"
+    group: "hardware"
     url: >-
       http://prometheus.dms.dappnode:9090/api/v1/query?query=
       avg((1 - node_filesystem_avail_bytes%7Bfstype!~%22^(fuse.*|tmpfs|cifs|nfs)%22%7D/
@@ -84,7 +84,7 @@ endpoints:
 
   - name: "Host Temperature Check"
     enabled: true
-    group: "host"
+    group: "hardware"
     url: "http://prometheus.dms.dappnode:9090/api/v1/query?query=avg(node_hwmon_temp_celsius%7Bchip%3D~%22coretemp.*%7Ck10temp.*%7Cpci0000%3A.*%7Cacpitz.*%22%7D)"
     method: "GET"
     interval: "30s"

From c17cb3b088c3110b75071fe4dee64f2fa60e6069 Mon Sep 17 00:00:00 2001
From: Pablo Mendez <pablo@dappnode.io>
Date: Thu, 22 May 2025 18:15:28 +0200
Subject: [PATCH 13/18] use single line

---
 notifications.yaml | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/notifications.yaml b/notifications.yaml
index a6d2d90..489225c 100644
--- a/notifications.yaml
+++ b/notifications.yaml
@@ -28,9 +28,7 @@ endpoints:
   - name: "Host Out of Memory Check"
     enabled: true
     group: "hardware"
-    url: >-
-      http://prometheus.dms.dappnode:9090/api/v1/query?query=
-      100*(1 - node_memory_MemAvailable_bytes/node_memory_MemTotal_bytes)
+    url: "http://prometheus.dms.dappnode:9090/api/v1/query?query=100*(1 - node_memory_MemAvailable_bytes/node_memory_MemTotal_bytes)"
     method: "GET"
     interval: "30s"
     conditions:
@@ -56,10 +54,7 @@ endpoints:
   - name: "Host Out of Disk Space Check"
     enabled: true
     group: "hardware"
-    url: >-
-      http://prometheus.dms.dappnode:9090/api/v1/query?query=
-      avg((1 - node_filesystem_avail_bytes%7Bfstype!~%22^(fuse.*|tmpfs|cifs|nfs)%22%7D/
-      node_filesystem_size_bytes)*100)
+    url: "http://prometheus.dms.dappnode:9090/api/v1/query?query=avg((1 - node_filesystem_avail_bytes%7Bfstype!~%22^(fuse.*|tmpfs|cifs|nfs)%22%7D/node_filesystem_size_bytes)*100)""
     method: "GET"
     interval: "30s"
     conditions:

From 475316e7e79640dce13b07068986f030d5b4d62e Mon Sep 17 00:00:00 2001
From: Pablo Mendez <pablo@dappnode.io>
Date: Thu, 22 May 2025 18:50:02 +0200
Subject: [PATCH 14/18] don't use whitespace in query URLs

---
 notifications.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/notifications.yaml b/notifications.yaml
index 489225c..de8e91b 100644
--- a/notifications.yaml
+++ b/notifications.yaml
@@ -28,7 +28,7 @@ endpoints:
   - name: "Host Out of Memory Check"
     enabled: true
     group: "hardware"
-    url: "http://prometheus.dms.dappnode:9090/api/v1/query?query=100*(1 - node_memory_MemAvailable_bytes/node_memory_MemTotal_bytes)"
+    url: "http://prometheus.dms.dappnode:9090/api/v1/query?query=100*(1-node_memory_MemAvailable_bytes/node_memory_MemTotal_bytes)"
     method: "GET"
     interval: "30s"
     conditions:
@@ -54,7 +54,7 @@ endpoints:
   - name: "Host Out of Disk Space Check"
     enabled: true
     group: "hardware"
-    url: "http://prometheus.dms.dappnode:9090/api/v1/query?query=avg((1 - node_filesystem_avail_bytes%7Bfstype!~%22^(fuse.*|tmpfs|cifs|nfs)%22%7D/node_filesystem_size_bytes)*100)""
+    url: "http://prometheus.dms.dappnode:9090/api/v1/query?query=avg((1-node_filesystem_avail_bytes%7Bfstype!~%22^(fuse.*|tmpfs|cifs|nfs)%22%7D/node_filesystem_size_bytes)*100)"
     method: "GET"
     interval: "30s"
     conditions:

From f974c70560b173a3472f754a9fb4a843c6a5ad0d Mon Sep 17 00:00:00 2001
From: Pablo Mendez <pablo@dappnode.io>
Date: Mon, 26 May 2025 10:16:32 +0200
Subject: [PATCH 15/18] update notifications yaml

---
 notifications.yaml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/notifications.yaml b/notifications.yaml
index de8e91b..4caaa9c 100644
--- a/notifications.yaml
+++ b/notifications.yaml
@@ -2,7 +2,7 @@ endpoints:
   - name: "High CPU Usage Check"
     enabled: true
     group: "hardware"
-    url: "http://prometheus.dms.dappnode:9090/api/v1/query?query=100*avg%20by%20(instance)%20(rate(node_cpu_seconds_total%7Bmode!%3D%22idle%22%7D%5B2m%5D))"
+    url: "http://prometheus.dms.dappnode:9090/api/v1/query?query=100%20*%20sum%20by%28instance%29%20%28rate%28node_cpu_seconds_total%7Bmode!%3D%22idle%22%7D%5B2m%5D%29%29%20%2F%20sum%20by%28instance%29%20%28rate%28node_cpu_seconds_total%5B2m%5D%29%29"
     method: "GET"
     interval: "30s"
     conditions:
@@ -68,7 +68,7 @@ endpoints:
       description: "Triggers if disk usage exceeds the limit defined in the condition"
     priority: "high"
     correlationId: "dms-disk"
-    isBanner: "true"
+    isBanner: "false"
     alerts:
       - type: custom
         enabled: true
@@ -94,7 +94,7 @@ endpoints:
       description: "Triggers if the average node temperature exceeds the defined threshold"
     priority: "medium"
     correlationId: "dms-temperature"
-    isBanner: "true"
+    isBanner: "false"
     alerts:
       - type: custom
         enabled: true

From 0a527f17673bedc5e610b628d6e9be2359d729a3 Mon Sep 17 00:00:00 2001
From: mateumiralles <mateumiralles714@gmail.com>
Date: Wed, 28 May 2025 11:12:33 +0200
Subject: [PATCH 16/18] Update copies

---
 notifications.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/notifications.yaml b/notifications.yaml
index 4caaa9c..dfea045 100644
--- a/notifications.yaml
+++ b/notifications.yaml
@@ -25,7 +25,7 @@ endpoints:
         success-threshold: 1
         send-on-resolved: true
 
-  - name: "Host Out of Memory Check"
+  - name: "Host Memory Check"
     enabled: true
     group: "hardware"
     url: "http://prometheus.dms.dappnode:9090/api/v1/query?query=100*(1-node_memory_MemAvailable_bytes/node_memory_MemTotal_bytes)"
@@ -51,7 +51,7 @@ endpoints:
         success-threshold: 1
         send-on-resolved: true
 
-  - name: "Host Out of Disk Space Check"
+  - name: "Host Disk Space Check"
     enabled: true
     group: "hardware"
     url: "http://prometheus.dms.dappnode:9090/api/v1/query?query=avg((1-node_filesystem_avail_bytes%7Bfstype!~%22^(fuse.*|tmpfs|cifs|nfs)%22%7D/node_filesystem_size_bytes)*100)"

From a7fbd0445cedd089eab87b362031bdff237133b3 Mon Sep 17 00:00:00 2001
From: Marc Font <36164126+Marketen@users.noreply.github.com>
Date: Wed, 28 May 2025 12:02:03 +0200
Subject: [PATCH 17/18] improve cpu query (#84)

---
 notifications.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/notifications.yaml b/notifications.yaml
index dfea045..96b52eb 100644
--- a/notifications.yaml
+++ b/notifications.yaml
@@ -2,7 +2,7 @@ endpoints:
   - name: "High CPU Usage Check"
     enabled: true
     group: "hardware"
-    url: "http://prometheus.dms.dappnode:9090/api/v1/query?query=100%20*%20sum%20by%28instance%29%20%28rate%28node_cpu_seconds_total%7Bmode!%3D%22idle%22%7D%5B2m%5D%29%29%20%2F%20sum%20by%28instance%29%20%28rate%28node_cpu_seconds_total%5B2m%5D%29%29"
+    url: "http://prometheus.dms.dappnode:9090/api/v1/query?query=avg%28node_hwmon_temp_celsius%7Bchip%3D~%22.*coretemp.*%7C.*18_3%24%7C.*k10temp.*%22%7D%29"
     method: "GET"
     interval: "30s"
     conditions:

From 6ff102abcde9edf8d2ed791e317ad75740a13b6c Mon Sep 17 00:00:00 2001
From: Marketen <marcfont12@gmail.com>
Date: Wed, 28 May 2025 12:31:49 +0200
Subject: [PATCH 18/18] fix cpu usage and temp

---
 notifications.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/notifications.yaml b/notifications.yaml
index 96b52eb..3a5e6f3 100644
--- a/notifications.yaml
+++ b/notifications.yaml
@@ -2,7 +2,7 @@ endpoints:
   - name: "High CPU Usage Check"
     enabled: true
     group: "hardware"
-    url: "http://prometheus.dms.dappnode:9090/api/v1/query?query=avg%28node_hwmon_temp_celsius%7Bchip%3D~%22.*coretemp.*%7C.*18_3%24%7C.*k10temp.*%22%7D%29"
+    url: "http://prometheus.dms.dappnode:9090/api/v1/query?query=100%20*%20sum%20by%28instance%29%20%28rate%28node_cpu_seconds_total%7Bmode!%3D%22idle%22%7D%5B2m%5D%29%29%20%2F%20sum%20by%28instance%29%20%28rate%28node_cpu_seconds_total%5B2m%5D%29%29"
     method: "GET"
     interval: "30s"
     conditions:
@@ -80,7 +80,7 @@ endpoints:
   - name: "Host Temperature Check"
     enabled: true
     group: "hardware"
-    url: "http://prometheus.dms.dappnode:9090/api/v1/query?query=avg(node_hwmon_temp_celsius%7Bchip%3D~%22coretemp.*%7Ck10temp.*%7Cpci0000%3A.*%7Cacpitz.*%22%7D)"
+    url: "http://prometheus.dms.dappnode:9090/api/v1/query?query=avg%28node_hwmon_temp_celsius%7Bchip%3D~%22.*coretemp.*%7C.*18_3%24%7C.*k10temp.*%22%7D%29"
     method: "GET"
     interval: "30s"
     conditions: