Skip to content

Commit 0185ec4

Browse files
fix log alerts (#23)
* fix log alerts * improve max memory usage for networking buf. pool alert * search "Repair-Task" in the repair check * Fixes github pipeline --------- Co-authored-by: Sergio Rua <sergio@axonops.com>
1 parent 2e3e134 commit 0185ec4

File tree

1 file changed

+22
-31
lines changed

1 file changed

+22
-31
lines changed

examples/alerts/config/REPLACE_WITH_ORG_NAME/log_alert_rules.yml

Lines changed: 22 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -3,15 +3,15 @@
33
# These rules are some examples of how alerts can be configured for
44
# your Apache Cassandra cluster. The rules defined here will be applied
55
# to all clusters.
6-
# Log phrases must be wrapped as: "text to search"
6+
# Log phrases must be wrapped as: \"text to search\"
77

88
axonops_log_alert_rule:
99

1010
- name: Node Down
1111
warning_value: 1
1212
critical_value: 5
1313
duration: 5m
14-
content: "is now DOWN"
14+
content: \"is now DOWN\"
1515
description: Detected node down
1616
source: "/var/log/cassandra/system.log"
1717
present: true
@@ -20,7 +20,7 @@ axonops_log_alert_rule:
2020
warning_value: 1
2121
critical_value: 30
2222
duration: 5m
23-
content: "Invalid or unsupported protocol version"
23+
content: \"Invalid or unsupported protocol version\"
2424
source: "/var/log/cassandra/system.log"
2525
description: Detected clients connecting with invalid or unsupported protocol version
2626
present: true
@@ -30,7 +30,7 @@ axonops_log_alert_rule:
3030
critical_value: 1
3131
operator: '<'
3232
duration: 24h
33-
content: repair
33+
content: \"Repair-Task\"
3434
source: "/var/log/cassandra/system.log"
3535
description: "Detected no repair has been seen in the last 24h"
3636
present: true
@@ -39,7 +39,7 @@ axonops_log_alert_rule:
3939
warning_value: 50
4040
critical_value: 100
4141
duration: 5m
42-
content: "Failed to handshake with peer"
42+
content: \"Failed to handshake with peer\"
4343
source: "/var/log/cassandra/system.log"
4444
description: "Detected TLS handshake error with peer"
4545
present: true
@@ -48,7 +48,7 @@ axonops_log_alert_rule:
4848
warning_value: 1
4949
critical_value: 1
5050
duration: 1m
51-
content: "dropping message of type GOSSIP"
51+
content: \"dropping message of type GOSSIP\"
5252
source: "/var/log/cassandra/system.log"
5353
description: "Detected gossip message drops"
5454
present: true
@@ -57,7 +57,7 @@ axonops_log_alert_rule:
5757
warning_value: 1
5858
critical_value: 1
5959
duration: 5ms
60-
content: "failed stream session"
60+
content: \"failed stream session\"
6161
source: "/var/log/cassandra/system.log"
6262
description: "Detected stream session failure"
6363
present: true
@@ -66,7 +66,7 @@ axonops_log_alert_rule:
6666
warning_value: 1
6767
critical_value: 1
6868
duration: 10s
69-
content: "Corrupt sstable"
69+
content: \"Corrupt sstable\"
7070
source: "/var/log/cassandra/system.log"
7171
description: "Detected SSTable file corruption"
7272
present: true
@@ -75,7 +75,7 @@ axonops_log_alert_rule:
7575
warning_value: 1
7676
critical_value: 1000
7777
duration: 5m
78-
content: "Starting anticompaction"
78+
content: \"Starting anticompaction\"
7979
source: "/var/log/cassandra/system.log"
8080
description: "Detected anticompaction - possibly triggered by an incremental repair"
8181
present: true
@@ -84,7 +84,7 @@ axonops_log_alert_rule:
8484
warning_value: 1
8585
critical_value: 1
8686
duration: 30s
87-
content: "JNA not found"
87+
content: \"JNA not found\"
8888
source: "/var/log/cassandra/system.log"
8989
description: "Missing JNA"
9090
present: true
@@ -93,34 +93,25 @@ axonops_log_alert_rule:
9393
warning_value: 1
9494
critical_value: 1
9595
duration: 15m
96-
content: "Not enough space for compaction"
96+
content: \"Not enough space for compaction\"
9797
source: "/var/log/cassandra/system.log"
9898
description: "Unable to compact due to disk space"
9999
present: true
100100

101101
- name: Maximum memory usage reached for networking buffer pool
102-
warning_value: 1
103-
critical_value: 1
104-
duration: 15m
105-
content: "networking buffer pool, cannot allocate chunk of"
106-
source: "/var/log/cassandra/system.log"
107-
description: "Maximum memory usage reached, increase file_cache_size"
108-
present: true
109-
110-
- name: Maximum memory usage reached for networking buffer pool
111-
warning_value: 1
112-
critical_value: 1
113-
duration: 15m
114-
content: "networking buffer pool, cannot allocate chunk of"
102+
warning_value: 3
103+
critical_value: 20
104+
duration: 60m
105+
content: '+"INFO.*Messaging-EventLoop" +NoSpamLogger.* +"for networking buffer pool" +"cannot allocate chunk"'
115106
source: "/var/log/cassandra/system.log"
116-
description: "Maximum memory usage reached, increase file_cache_size"
107+
description: "Maximum memory usage reached, networking_cache_size needs to be increased"
117108
present: true
118109

119110
- name: Unable to lock JVM memory (ENOMEM)
120111
warning_value: 1
121112
critical_value: 1
122113
duration: 15m
123-
content: "Unable to lock JVM memory (ENOMEM)"
114+
content: \"Unable to lock JVM memory (ENOMEM)\"
124115
source: "/var/log/cassandra/system.log"
125116
description: "Unable to lock JVM memory (ENOMEM), increase RLIMIT_MEMLOCK"
126117
present: true
@@ -129,7 +120,7 @@ axonops_log_alert_rule:
129120
warning_value: 1
130121
critical_value: 1
131122
duration: 15m
132-
content: "Unknown mlockall error"
123+
content: \"Unknown mlockall error\"
133124
source: "/var/log/cassandra/system.log"
134125
description: "Unknown mlockall error"
135126
present: true
@@ -138,7 +129,7 @@ axonops_log_alert_rule:
138129
warning_value: 1
139130
critical_value: 1
140131
duration: 15m
141-
content: "the current operating system"
132+
content: \"the current operating system\"
142133
source: "/var/log/cassandra/system.log"
143134
description: "the current operating system, is unsupported by Cassandra"
144135
present: true
@@ -147,7 +138,7 @@ axonops_log_alert_rule:
147138
warning_value: 1
148139
critical_value: 1
149140
duration: 15m
150-
content: "Obsolete version of JNA present"
141+
content: \"Obsolete version of JNA present\"
151142
source: "/var/log/cassandra/system.log"
152143
description: "Obsolete version of JNA present; unable to read errno. Upgrade to JNA 3.2.7 or later"
153144
present: true
@@ -158,7 +149,7 @@ axonops_log_alert_rule:
158149
warning_value: 1
159150
critical_value: 500
160151
duration: 15m
161-
content: "Writing large partition"
152+
content: \"Writing large partition\"
162153
source: "/var/log/cassandra/system.log"
163154
description: "Cassandra is writing a large partition on disk. This can create issue with reads and repairs. It is suggested to review the schema"
164155
present: true
@@ -180,7 +171,7 @@ axonops_log_alert_rule:
180171
warning_value: 1
181172
critical_value: 500
182173
duration: 15m
183-
content: "jemalloc shared library could not be preloaded to speed up memory allocations"
174+
content: \"jemalloc shared library could not be preloaded to speed up memory allocations\"
184175
source: "/var/log/cassandra/system.log"
185176
description: "Jemalloc shared library could not be preloaded. This can affect performance."
186177
present: true

0 commit comments

Comments
 (0)