@@ -6,31 +6,26 @@ groups:
66 - name : latency
77 rules :
88 - alert : Average Latency Warning
9- expr : ' (irate(endpoint_acc_latency{job="redis", cluster="localhost", db="1"}[1m])) / (irate(endpoint_total_started_res{job="redis", cluster="localhost", db="1"}[1m]))'
9+ # Tune the threshold to your use case and expected latency. This example fires when the per-db average latency (read + write + other requests) stays above 2 for 30s; the /1000 presumably converts microseconds to milliseconds - TODO confirm the metric unit.
10+ expr : ' (sum by (db)(irate(endpoint_read_requests_latency_histogram_sum[1m]) + irate(endpoint_write_requests_latency_histogram_sum[1m]) + irate(endpoint_other_requests_latency_histogram_sum[1m])))/(sum by (db)(irate(endpoint_read_requests[1m]) + irate(endpoint_write_requests[1m]) + irate(endpoint_other_requests[1m])))/1000 > 2'
1011 for : 30s
1112 labels :
12- severity : notification
13- type : latency
14- job : redis
15- cluster : localhost
16- db : 1
13+ severity : notification
14+ type : latency
1715 annotations :
18- summary : Average Latency Warning
19- description : " High Latency - Cluster: {{$labels.cluster}} DB: {{$labels.db}} Latency: {{$value}} ms"
20- runbook : https://redis.io/docs/latest/integrate/prometheus-with-redis-enterprise/observability/#read-latency
16+ summary : Average Latency Warning
17+ description : " High Latency - DB: {{$labels.db}} Latency: {{$value}} ms"
18+ runbook : https://redis.io/docs/latest/integrate/prometheus-with-redis-enterprise/observability/#read-latency
2119 - alert : Average Latency Critical
22- expr : ' (irate(endpoint_acc_latency{job="redis", cluster="localhost", db="1"} [1m])) / ( irate(endpoint_total_started_res{job="redis", cluster="localhost", db="1"} [1m])) '
20+ expr : ' (sum by (db)( irate(endpoint_read_requests_latency_histogram_sum[1m]) + irate(endpoint_write_requests_latency_histogram_sum [1m]) + irate(endpoint_other_requests_latency_histogram_sum[1m])))/(sum by (db)( irate(endpoint_read_requests[1m]) + irate(endpoint_write_requests [1m]) + irate(endpoint_other_requests[1m])))/1000 > 5 ' # per-db average latency over read + write + other requests; critical threshold is 5 (ms, assuming /1000 converts from microseconds - confirm)
2321 for : 30s
2422 labels :
25- severity : critical
26- type : latency
27- job : redis
28- cluster : localhost
29- db : 1
23+ severity : critical
24+ type : latency
3025 annotations :
31- summary : Average Latency Critical
32- description : " High Latency - Cluster: {{$labels.cluster}} DB: {{$labels.db}} Latency: {{$value}} ms"
33- runbook : https://redis.io/docs/latest/integrate/prometheus-with-redis-enterprise/observability/#read-latency
26+ summary : Average Latency Critical
27+ description : " High Latency - DB: {{$labels.db}} Latency: {{$value}} ms"
28+ runbook : https://redis.io/docs/latest/integrate/prometheus-with-redis-enterprise/observability/#read-latency
3429
3530 - name : connections
3631 rules :
@@ -46,7 +41,7 @@ groups:
4641 description : " No Connections - Cluster: {{$labels.cluster}} DB: {{$labels.db}} Connections: {{$value}}"
4742 runbook : https://redis.io/docs/latest/integrate/prometheus-with-redis-enterprise/observability/#connections
4843 - alert : Excessive Connections
49- expr : endpoint_client_connections > 64000
44+ expr : endpoint_client_connections - endpoint_client_disconnections > 64000
5045 for : 30s
5146 labels :
5247 severity : critical
@@ -60,7 +55,7 @@ groups:
6055 - name : throughput
6156 rules :
6257 - alert : No Redis Requests
63- expr : endpoint_read_requests < 1 and endpoint_write_requests < 1
58+ expr : increase( endpoint_read_requests[1m]) < 1 and increase( endpoint_write_requests[1m]) < 1
6459 for : 30s
6560 labels :
6661 severity : critical
@@ -70,7 +65,7 @@ groups:
7065 description : " Too few Redis operations - Cluster: {{$labels.cluster}} DB: {{$labels.db}} {{$value}} (ops/sec)"
7166 runbook : https://redis.io/docs/latest/integrate/prometheus-with-redis-enterprise/observability/#connections
7267 - alert : Excessive Redis Requests
73- expr : (endpoint_read_requests + endpoint_write_requests) /2 > 1000000
68+ expr : (irate( endpoint_read_requests[1m]) + irate( endpoint_write_requests[1m])) /2 > 100000 # The actual threshold will be based on your Redis Database, adjust as needed
7469 for : 30s
7570 labels :
7671 severity : critical
@@ -83,7 +78,7 @@ groups:
8378 - name : capacity
8479 rules :
8580 - alert : DB full
86- expr : round ((redis_server_used_memory{job="redis", cluster="localhost"}/redis_server_memory_limit{job="redis", cluster="localhost"} ) * 100) > 95
81+ expr : ((redis_server_used_memory/redis_server_maxmemory ) * 100 ) > 95
8782 for : 30s
8883 labels :
8984 severity : critical
@@ -93,7 +88,7 @@ groups:
9388 description : " DB Usage - Cluster: {{$labels.cluster}} DB: {{$labels.db}} Usage: {{$value}}% full"
9489 runbook : https://redis.io/docs/latest/integrate/prometheus-with-redis-enterprise/observability/#memory
9590 - alert : DB full in 2 hours
96- expr : round((redis_server_used_memory{job="redis", cluster="localhost"}/redis_server_memory_limit{job="redis", cluster="localhost"} ) * 100) < 95 and (predict_linear(redis_server_used_memory{job="redis", cluster="localhost"} [15m], 2 * 3600) / redis_server_memory_limit{job="redis", cluster="localhost"} ) > 0.3 and round(predict_linear(redis_server_used_memory{job="redis", cluster="localhost"} [15m], 2 * 3600)/redis_server_memory_limit{job="redis", cluster="localhost"} ) > 0.95
91+ expr : round((redis_server_used_memory/redis_server_maxmemory ) * 100) < 95 and (predict_linear(redis_server_used_memory[15m], 2 * 3600) / redis_server_maxmemory ) > 0.3 and (predict_linear(redis_server_used_memory[15m], 2 * 3600)/redis_server_maxmemory ) > 0.95 # prediction ratio is compared unrounded: round(ratio) > 0.95 is already true at a predicted 50% fill
9792 for : 30s
9893 labels :
9994 severity : notification
@@ -106,7 +101,7 @@ groups:
106101 - name : utilization
107102 rules :
108103 - alert : Low Hit Ratio
109- expr : (redis_server_keyspace_read_hits{role="master"}/(redis_server_keyspace_read_hits{role="master"} + redis_server_keyspace_read_misses{role="master"})) < 1
104+ expr : (irate( redis_server_keyspace_read_hits{role="master"}[1m])/(irate( redis_server_keyspace_read_hits{role="master"}[1m]) + irate( redis_server_keyspace_read_misses{role="master"}[1m] ))) < 0.5 # hit ratio = hits / (hits + misses); below 0.5 is the same firing condition as hits/misses < 1, but the alert value is now the actual ratio
110105 for : 30s
111106 labels :
112107 severity : notification
@@ -173,7 +168,7 @@ groups:
173168 - name : nodes
174169 rules :
175170 - alert : Node Not Responding
176- expr : count(node_metrics_up) != 3
171+ expr : count(node_metrics_up) != 3 # will depend on the number of nodes in your Redis cluster
177172 for : 5m
178173 labels :
179174 severity : critical
@@ -193,7 +188,7 @@ groups:
193188 summary : Node Persistent Storage
194189 description : " Low on Persistent Storage - Cluster: {{$labels.cluster}} Space Free: {{$value}}%"
195190 - alert : Node Ephemeral Storage
196- expr : round(( node_ephemeral_storage_free_bytes/node_ephemeral_storage_avail_bytes) * 100) <= 5
191+ expr : node_ephemeral_storage_free_bytes < 5000000000 # Depends on your environment how much ephemeral storage you want to trigger on (5GB in this example)
197192 for : 2m
198193 labels :
199194 severity : critical
@@ -202,7 +197,7 @@ groups:
202197 summary : Node Ephemeral Storage
203198 description : " Low on Ephemeral Storage - Cluster: {{$labels.cluster}} Space Free: {{$value}} bytes"
204199 - alert : Node Free Memory
205- expr : round(( node_available_memory_bytes/node_memory_MemFree_bytes) * 100) <= 15
200+ expr : node_available_memory_bytes < 5000000000 # Depends on your environment how much memory you want to trigger on (5GB in this example)
206201 for : 2m
207202 labels :
208203 severity : critical
0 commit comments