Skip to content

Commit 2ba5dd9

Browse files
committed
updates to v2 alerts
1 parent 62407c4 commit 2ba5dd9

File tree

9 files changed

+60
-62
lines changed

9 files changed

+60
-62
lines changed

grafana_v2/kickstart_v2/docker-compose.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,8 @@ services:
2727
- prometheus
2828
extra_hosts:
2929
- "host.docker.internal:host-gateway"
30+
environment:
31+
- "GF_SECURITY_ADMIN_PASSWORD=${GF_SECURITY_ADMIN_PASSWORD:-very-secret-password}"  # override via the shell environment or a .env file; the fallback value is for local demos only
3032
networks:
3133
kickstart:
3234
ipv4_address: "172.27.2.3"

grafana_v2/kickstart_v2/terraform/gcp/main.tf

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -205,7 +205,7 @@ resource "null_resource" "run-kickstart" {
205205
provisioner "remote-exec" {
206206
inline = [
207207
"cd redis-enterprise-observability/grafana_v2/kickstart_v2",
208-
"git checkout main",
208+
"git checkout ${var.git_branch}",
209209
"./setup.sh ${local.redis_db_primary_fqdn} ../dashboards/grafana_v9-11/cloud/basic",
210210
]
211211
}

grafana_v2/kickstart_v2/terraform/gcp/variables.tf

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,4 +85,10 @@ variable "existing_vpc_name" {
8585
type = string
8686
default = null
8787
description = "Name of existing VPC to use for peering. Required when existing_vpc_id is provided."
88+
}
89+
90+
variable "git_branch" {
91+
type = string
92+
description = "The git branch to use for the Grafana dashboards and Prometheus rules"
93+
default = "main"
8894
}

prometheus_v2/rules/alerts.yml

Lines changed: 22 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -6,31 +6,26 @@ groups:
66
- name: latency
77
rules:
88
- alert: Average Latency Warning
9-
expr: '(irate(endpoint_acc_latency{job="redis", cluster="localhost", db="1"}[1m])) / (irate(endpoint_total_started_res{job="redis", cluster="localhost", db="1"}[1m]))'
9+
# The threshold depends on your use case and your expected latency; this example alerts when the average latency exceeds 2 ms.
10+
expr: '(sum by (db)(irate(endpoint_read_requests_latency_histogram_sum[1m]) + irate(endpoint_write_requests_latency_histogram_sum[1m]) + irate(endpoint_other_requests_latency_histogram_sum[1m])))/(sum by (db)(irate(endpoint_read_requests[1m]) + irate(endpoint_write_requests[1m]) + irate(endpoint_other_requests[1m])))/1000 > 2'
1011
for: 30s
1112
labels:
12-
severity: notification
13-
type: latency
14-
job: redis
15-
cluster: localhost
16-
db: 1
13+
severity: notification
14+
type: latency
1715
annotations:
18-
summary: Average Latency Warning
19-
description: "High Latency - Cluster: {{$labels.cluster}} DB: {{$labels.db}} Latency: {{$value}} ms"
20-
runbook: https://redis.io/docs/latest/integrate/prometheus-with-redis-enterprise/observability/#read-latency
16+
summary: Average Latency Warning
17+
description: "High Latency - DB: {{$labels.db}} Latency: {{$value}} ms"
18+
runbook: https://redis.io/docs/latest/integrate/prometheus-with-redis-enterprise/observability/#read-latency
2119
- alert: Average Latency Critical
22-
expr: '(irate(endpoint_acc_latency{job="redis", cluster="localhost", db="1"}[1m])) / (irate(endpoint_total_started_res{job="redis", cluster="localhost", db="1"}[1m]))'
20+
expr: '(sum by (db)(irate(endpoint_read_requests_latency_histogram_sum[1m]) + irate(endpoint_write_requests_latency_histogram_sum[1m]) + irate(endpoint_other_requests_latency_histogram_sum[1m])))/(sum by (db)(irate(endpoint_read_requests[1m]) + irate(endpoint_write_requests[1m]) + irate(endpoint_other_requests[1m])))/1000 > 5' # Critical tier: fires when the average latency exceeds 5 ms; adjust to your workload
2321
for: 30s
2422
labels:
25-
severity: critical
26-
type: latency
27-
job: redis
28-
cluster: localhost
29-
db: 1
23+
severity: critical
24+
type: latency
3025
annotations:
31-
summary: Average Latency Critical
32-
description: "High Latency - Cluster: {{$labels.cluster}} DB: {{$labels.db}} Latency: {{$value}} ms"
33-
runbook: https://redis.io/docs/latest/integrate/prometheus-with-redis-enterprise/observability/#read-latency
26+
summary: Average Latency Critical
27+
description: "High Latency - DB: {{$labels.db}} Latency: {{$value}} ms"
28+
runbook: https://redis.io/docs/latest/integrate/prometheus-with-redis-enterprise/observability/#read-latency
3429

3530
- name: connections
3631
rules:
@@ -46,7 +41,7 @@ groups:
4641
description: "No Connections - Cluster: {{$labels.cluster}} DB: {{$labels.db}} Connections: {{$value}}"
4742
runbook: https://redis.io/docs/latest/integrate/prometheus-with-redis-enterprise/observability/#connections
4843
- alert: Excessive Connections
49-
expr: endpoint_client_connections > 64000
44+
expr: endpoint_client_connections - endpoint_client_disconnections > 64000
5045
for: 30s
5146
labels:
5247
severity: critical
@@ -60,7 +55,7 @@ groups:
6055
- name: throughput
6156
rules:
6257
- alert: No Redis Requests
63-
expr: endpoint_read_requests < 1 and endpoint_write_requests < 1
58+
expr: increase(endpoint_read_requests[1m]) < 1 and increase(endpoint_write_requests[1m]) < 1
6459
for: 30s
6560
labels:
6661
severity: critical
@@ -70,7 +65,7 @@ groups:
7065
description: "Too few Redis operations - Cluster: {{$labels.cluster}} DB: {{$labels.db}} {{$value}} (ops/sec)"
7166
runbook: https://redis.io/docs/latest/integrate/prometheus-with-redis-enterprise/observability/#connections
7267
- alert: Excessive Redis Requests
73-
expr: (endpoint_read_requests + endpoint_write_requests)/2 > 1000000
68+
expr: (irate(endpoint_read_requests[1m]) + irate(endpoint_write_requests[1m]))/2 > 100000 # The actual threshold will be based on your Redis Database, adjust as needed
7469
for: 30s
7570
labels:
7671
severity: critical
@@ -83,7 +78,7 @@ groups:
8378
- name: capacity
8479
rules:
8580
- alert: DB full
86-
expr: round((redis_server_used_memory{job="redis", cluster="localhost"}/redis_server_memory_limit{job="redis", cluster="localhost"}) * 100) > 95
81+
expr: ((redis_server_used_memory/redis_server_maxmemory) * 100 ) > 95
8782
for: 30s
8883
labels:
8984
severity: critical
@@ -93,7 +88,7 @@ groups:
9388
description: "DB Usage - Cluster: {{$labels.cluster}} DB: {{$labels.db}} Usage: {{$value}}% full"
9489
runbook: https://redis.io/docs/latest/integrate/prometheus-with-redis-enterprise/observability/#memory
9590
- alert: DB full in 2 hours
96-
expr: round((redis_server_used_memory{job="redis", cluster="localhost"}/redis_server_memory_limit{job="redis", cluster="localhost"}) * 100) < 95 and (predict_linear(redis_server_used_memory{job="redis", cluster="localhost"}[15m], 2 * 3600) / redis_server_memory_limit{job="redis", cluster="localhost"}) > 0.3 and round(predict_linear(redis_server_used_memory{job="redis", cluster="localhost"}[15m], 2 * 3600)/redis_server_memory_limit{job="redis", cluster="localhost"}) > 0.95
91+
expr: round((redis_server_used_memory/redis_server_maxmemory) * 100) < 95 and (predict_linear(redis_server_used_memory[15m], 2 * 3600) / redis_server_maxmemory) > 0.3 and (predict_linear(redis_server_used_memory[15m], 2 * 3600)/redis_server_maxmemory) > 0.95 # the forecast ratio is deliberately not wrapped in round(): rounding to an integer would make "> 0.95" match any ratio >= 0.5
9792
for: 30s
9893
labels:
9994
severity: notification
@@ -106,7 +101,7 @@ groups:
106101
- name: utilization
107102
rules:
108103
- alert: Low Hit Ratio
109-
expr: (redis_server_keyspace_read_hits{role="master"}/(redis_server_keyspace_read_hits{role="master"} + redis_server_keyspace_read_misses{role="master"})) < 1
104+
expr: (irate(redis_server_keyspace_read_hits{role="master"}[1m])/irate(redis_server_keyspace_read_misses{role="master"}[1m])) < 1
110105
for: 30s
111106
labels:
112107
severity: notification
@@ -173,7 +168,7 @@ groups:
173168
- name: nodes
174169
rules:
175170
- alert: Node Not Responding
176-
expr: count(node_metrics_up) != 3
171+
expr: count(node_metrics_up) != 3 # will depend on the number of nodes in your Redis cluster
177172
for: 5m
178173
labels:
179174
severity: critical
@@ -193,7 +188,7 @@ groups:
193188
summary: Node Persistent Storage
194189
description: "Low on Persistent Storage - Cluster: {{$labels.cluster}} Space Free: {{$value}}%"
195190
- alert: Node Ephemeral Storage
196-
expr: round((node_ephemeral_storage_free_bytes/node_ephemeral_storage_avail_bytes) * 100) <= 5
191+
expr: node_ephemeral_storage_free_bytes < 5000000000 # Depends on your environment how much ephemeral storage you want to trigger on (5GB in this example)
197192
for: 2m
198193
labels:
199194
severity: critical
@@ -202,7 +197,7 @@ groups:
202197
summary: Node Ephemeral Storage
203198
description: "Low on Ephemeral Storage - Cluster: {{$labels.cluster}} Space Free: {{ $value | humanize1024 }}B"  # $value is node_ephemeral_storage_free_bytes (bytes), so it is rendered as a size rather than a percentage
204199
- alert: Node Free Memory
205-
expr: round((node_available_memory_bytes/node_memory_MemFree_bytes) * 100) <= 15
200+
expr: node_available_memory_bytes < 5000000000 # Depends on your environment how much memory you want to trigger on (5GB in this example)
206201
for: 2m
207202
labels:
208203
severity: critical

prometheus_v2/rules/capacity-alerts.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ groups:
22
- name: capacity
33
rules:
44
- alert: DB full
5-
expr: round((redis_server_used_memory{job="redis", cluster="localhost"}/redis_server_memory_limit{job="redis", cluster="localhost"}) * 100) > 95
5+
expr: ((redis_server_used_memory/redis_server_maxmemory) * 100 ) > 95 # same 95% cut-off as the "DB full in 2 hours" rule (< 95), so the two capacity alerts do not overlap
66
for: 30s
77
labels:
88
severity: critical
@@ -12,7 +12,7 @@ groups:
1212
description: "DB Usage - Cluster: {{$labels.cluster}} DB: {{$labels.db}} Usage: {{$value}}% full"
1313
runbook: https://redis.io/docs/latest/integrate/prometheus-with-redis-enterprise/observability/#memory
1414
- alert: DB full in 2 hours
15-
expr: round((redis_server_used_memory{job="redis", cluster="localhost"}/redis_server_memory_limit{job="redis", cluster="localhost"}) * 100) < 95 and (predict_linear(redis_server_used_memory{job="redis", cluster="localhost"}[15m], 2 * 3600) / redis_server_memory_limit{job="redis", cluster="localhost"}) > 0.3 and round(predict_linear(redis_server_used_memory{job="redis", cluster="localhost"}[15m], 2 * 3600)/redis_server_memory_limit{job="redis", cluster="localhost"}) > 0.95
15+
expr: round((redis_server_used_memory/redis_server_maxmemory) * 100) < 95 and (predict_linear(redis_server_used_memory[15m], 2 * 3600) / redis_server_maxmemory) > 0.3 and (predict_linear(redis_server_used_memory[15m], 2 * 3600)/redis_server_maxmemory) > 0.95 # the forecast ratio is deliberately not wrapped in round(): rounding to an integer would make "> 0.95" match any ratio >= 0.5
1616
for: 30s
1717
labels:
1818
severity: notification

prometheus_v2/rules/connection-alerts.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ groups:
1313
description: "No Connections - Cluster: {{$labels.cluster}} DB: {{$labels.db}} Connections: {{$value}}"
1414
runbook: https://redis.io/docs/latest/integrate/prometheus-with-redis-enterprise/observability/#connections
1515
- alert: Excessive Connections
16-
expr: endpoint_client_connections > 64000
16+
expr: endpoint_client_connections - endpoint_client_disconnections > 64000
1717
for: 30s
1818
labels:
1919
severity: critical
Lines changed: 23 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1,29 +1,24 @@
11
groups:
2-
- name: latency
3-
rules:
4-
- alert: Average Latency Warning
5-
expr: '(irate(endpoint_acc_latency{job="redis", cluster="localhost", db="1"}[1m])) / (irate(endpoint_total_started_res{job="redis", cluster="localhost", db="1"}[1m]))'
6-
for: 30s
7-
labels:
8-
severity: notification
9-
type: latency
10-
job: redis
11-
cluster: localhost
12-
db: 1
13-
annotations:
14-
summary: Average Latency Warning
15-
description: "High Latency - Cluster: {{$labels.cluster}} DB: {{$labels.db}} Latency: {{$value}} ms"
16-
runbook: https://redis.io/docs/latest/integrate/prometheus-with-redis-enterprise/observability/#read-latency
17-
- alert: Average Latency Critical
18-
expr: '(irate(endpoint_acc_latency{job="redis", cluster="localhost", db="1"}[1m])) / (irate(endpoint_total_started_res{job="redis", cluster="localhost", db="1"}[1m]))'
19-
for: 30s
20-
labels:
21-
severity: critical
22-
type: latency
23-
job: redis
24-
cluster: localhost
25-
db: 1
26-
annotations:
27-
summary: Average Latency Critical
28-
description: "High Latency - Cluster: {{$labels.cluster}} DB: {{$labels.db}} Latency: {{$value}} ms"
29-
runbook: https://redis.io/docs/latest/integrate/prometheus-with-redis-enterprise/observability/#read-latency
2+
- name: latency
3+
rules:
4+
- alert: Average Latency Warning
5+
# The threshold depends on your use case and your expected latency; this example alerts when the average latency exceeds 2 ms.
6+
expr: '(sum by (db)(irate(endpoint_read_requests_latency_histogram_sum[1m]) + irate(endpoint_write_requests_latency_histogram_sum[1m]) + irate(endpoint_other_requests_latency_histogram_sum[1m])))/(sum by (db)(irate(endpoint_read_requests[1m]) + irate(endpoint_write_requests[1m]) + irate(endpoint_other_requests[1m])))/1000 > 2'
7+
for: 30s
8+
labels:
9+
severity: notification
10+
type: latency
11+
annotations:
12+
summary: Average Latency Warning
13+
description: "High Latency - DB: {{$labels.db}} Latency: {{$value}} ms"
14+
runbook: https://redis.io/docs/latest/integrate/prometheus-with-redis-enterprise/observability/#read-latency
15+
- alert: Average Latency Critical
16+
expr: '(sum by (db)(irate(endpoint_read_requests_latency_histogram_sum[1m]) + irate(endpoint_write_requests_latency_histogram_sum[1m]) + irate(endpoint_other_requests_latency_histogram_sum[1m])))/(sum by (db)(irate(endpoint_read_requests[1m]) + irate(endpoint_write_requests[1m]) + irate(endpoint_other_requests[1m])))/1000 > 5' # Critical tier: fires when the average latency exceeds 5 ms; adjust to your workload
17+
for: 30s
18+
labels:
19+
severity: critical
20+
type: latency
21+
annotations:
22+
summary: Average Latency Critical
23+
description: "High Latency - DB: {{$labels.db}} Latency: {{$value}} ms"
24+
runbook: https://redis.io/docs/latest/integrate/prometheus-with-redis-enterprise/observability/#read-latency

prometheus_v2/rules/throughput-alerts.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ groups:
22
- name: throughput
33
rules:
44
- alert: No Redis Requests
5-
expr: endpoint_read_requests < 1 and endpoint_write_requests < 1
5+
expr: increase(endpoint_read_requests[1m]) < 1 and increase(endpoint_write_requests[1m]) < 1
66
for: 30s
77
labels:
88
severity: critical
@@ -12,7 +12,7 @@ groups:
1212
description: "Too few Redis operations - Cluster: {{$labels.cluster}} DB: {{$labels.db}} {{$value}} (ops/sec)"
1313
runbook: https://redis.io/docs/latest/integrate/prometheus-with-redis-enterprise/observability/#connections
1414
- alert: Excessive Redis Requests
15-
expr: (endpoint_read_requests + endpoint_write_requests)/2 > 1000000
15+
expr: (irate(endpoint_read_requests[1m]) + irate(endpoint_write_requests[1m]))/2 > 100000 # The actual threshold will be based on your Redis Database, adjust as needed
1616
for: 30s
1717
labels:
1818
severity: critical

prometheus_v2/rules/utilization-alerts.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ groups:
22
- name: utilization
33
rules:
44
- alert: Low Hit Ratio
5-
expr: (redis_server_keyspace_read_hits{role="master"}/(redis_server_keyspace_read_hits{role="master"} + redis_server_keyspace_read_misses{role="master"})) < 1
5+
expr: (irate(redis_server_keyspace_read_hits{role="master"}[1m])/irate(redis_server_keyspace_read_misses{role="master"}[1m])) < 1
66
for: 30s
77
labels:
88
severity: notification

0 commit comments

Comments
 (0)