Skip to content

Commit ebe981d

Browse files
authored
Merge pull request #72 from redis-field-engineering/prom/v2-alerts
FIELDENG-817 - updates to v2 alerts
2 parents 62407c4 + d20b884 commit ebe981d

File tree

12 files changed

+104
-72
lines changed

12 files changed

+104
-72
lines changed

grafana_v2/kickstart_v2/docker-compose.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,8 @@ services:
2727
- prometheus
2828
extra_hosts:
2929
- "host.docker.internal:host-gateway"
30+
environment:
31+
# Defaults to 'admin' so it matches the default password setup.sh uses for the Grafana API;
# export GF_SECURITY_ADMIN_PASSWORD before starting the stack to override it.
- GF_SECURITY_ADMIN_PASSWORD=${GF_SECURITY_ADMIN_PASSWORD:-admin}
3032
networks:
3133
kickstart:
3234
ipv4_address: "172.27.2.3"

grafana_v2/kickstart_v2/setup.sh

Lines changed: 28 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -10,14 +10,18 @@ PROJECT="demo"
1010

1111
# this script can take a second argument; the folder from which it should load dashboards
1212
# by default it will use ../dashboards/grafana_v9-11/software/basic
13+
# this script can take a third argument; the grafana password (defaults to 'admin')
14+
15+
# Set default password
16+
password="admin"
1317

1418
if [ $# -eq 0 ]; then
15-
echo "using default endpoint and folder"
19+
echo "using default endpoint, folder, and password"
1620
folder="../dashboards/grafana_v9-11/software/basic/*"
1721
fi
1822

1923
if [ $# -eq 1 ]; then
20-
echo "using endpoint $1 and default folder"
24+
echo "using endpoint $1, default folder, and default password"
2125
folder="../dashboards/grafana_v9-11/software/basic/*"
2226
if [[ "$OSTYPE" == "linux-gnu"* ]]; then # Linux
2327
sed -i "s/host.docker.internal/$1/g" prometheus.yml
@@ -28,8 +32,24 @@ fi
2832

2933
if [ $# -eq 2 ]; then
3034
if [ -d "$2" ]; then
31-
echo "using endpoint $1 and folder $2"
35+
echo "using endpoint $1, folder $2, and default password"
36+
folder="$2/*"
37+
if [[ "$OSTYPE" == "linux-gnu"* ]]; then # Linux
38+
sed -i "s/host.docker.internal/$1/g" prometheus.yml
39+
elif [[ "$OSTYPE" == "darwin"* ]]; then # Mac OSX
40+
sed -i '' "s/host.docker.internal/$1/g" prometheus.yml
41+
fi
42+
else
43+
echo "second argument must be a directory!"
44+
exit 1
45+
fi
46+
fi
47+
48+
if [ $# -eq 3 ]; then
49+
if [ -d "$2" ]; then
50+
echo "using endpoint $1, folder $2, and password $3"
3251
folder="$2/*"
52+
password="$3"
3353
if [[ "$OSTYPE" == "linux-gnu"* ]]; then # Linux
3454
sed -i "s/host.docker.internal/$1/g" prometheus.yml
3555
elif [[ "$OSTYPE" == "darwin"* ]]; then # Mac OSX
@@ -54,7 +74,7 @@ done
5474
# create prometheus datasource
5575
echo ""
5676
echo "create grafana datasource"
57-
curl -s 'http://admin:admin@localhost:3000/api/datasources' \
77+
# pass credentials with -u instead of embedding them in the URL, so passwords
# containing URL-reserved characters (@ : / #) do not corrupt the request
curl -s -u "admin:$password" 'http://localhost:3000/api/datasources' \
5878
--header 'Accept: application/json' \
5979
--header 'Content-Type: application/json' \
6080
--data '{ "name": "prometheus-demo",
@@ -68,13 +88,13 @@ curl -s 'http://admin:admin@localhost:3000/api/datasources' \
6888
echo ""
6989
echo ""
7090
echo "perform datasource health check"
71-
data=`curl -s 'http://admin:admin@localhost:3000/api/datasources/name/prometheus-demo'`
91+
# credentials go through -u so a password with URL-reserved characters still works
data=`curl -s -u "admin:$password" 'http://localhost:3000/api/datasources/name/prometheus-demo'`
7292

7393
str=${data#*\"uid\"\:\"*}
7494
uid=${str%%\"*}
7595

7696
# use the datasource's uid to check its health
77-
data=`curl -s 'http://admin:admin@localhost:3000/api/datasources/uid/'$uid'/health'`
97+
# credentials go through -u so a password with URL-reserved characters still works
data=`curl -s -u "admin:$password" "http://localhost:3000/api/datasources/uid/$uid/health"`
7898

7999
str=${data#*\"status\"\:\"*}
80100
status=${str%%\"*}
@@ -95,7 +115,7 @@ for file in $folder; do
95115
echo "$file"
96116
d=`cat "$file"`
97117
echo "{ \"dashboard\": $d,\"folderId\": 0, \"message\": \"Created by Redis demo setup script\", \"overwrite\": false}" \
98-
| sed s/\"uid\"\:\ \"\$\{DS_PROMETHEUS\}\"/\"name\"\:\ \"prometheus-demo\"/g | curl -s 'http://admin:admin@localhost:3000/api/dashboards/db' \
118+
| sed s/\"uid\"\:\ \"\$\{DS_PROMETHEUS\}\"/\"name\"\:\ \"prometheus-demo\"/g | curl -s -u "admin:$password" 'http://localhost:3000/api/dashboards/db' \
99119
--header 'Accept: application/json' \
100120
--header 'Content-Type: application/json' \
101121
--data-binary @-
@@ -104,7 +124,7 @@ done
104124

105125
echo ""
106126
echo "You can open a browser and access Grafana, and Prometheus at:"
107-
echo " Grafana: http://localhost:3000 (username=admin and password=admin)"
127+
echo " Grafana: http://localhost:3000 (username=admin and password=$password)"
108128
echo " Prometheus: http://localhost:9090"
109129
echo ""
110130
echo ""

grafana_v2/kickstart_v2/terraform/aws/main.tf

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -265,7 +265,7 @@ resource "null_resource" "run-kickstart" {
265265
inline = [
266266
"cd redis-enterprise-observability/grafana_v2/kickstart_v2",
267267
"git checkout main",
268-
"./setup.sh ${local.redis_db_primary_fqdn} ../dashboards/grafana_v9-11/cloud/basic",
268+
# single-quote the password (escaping embedded quotes) so the remote shell passes it to setup.sh as one literal argument
"./setup.sh ${local.redis_db_primary_fqdn} ../dashboards/grafana_v9-11/cloud/basic '${replace(var.grafana_password, "'", "'\\''")}'",
269269
]
270270
}
271271
}

grafana_v2/kickstart_v2/terraform/aws/variables.tf

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,4 +85,11 @@ variable "existing_security_group_id" {
8585
type = string
8686
default = null
8787
description = "ID of existing security group to use. Required when existing_vpc_id is provided."
88+
}
89+
90+
variable "grafana_password" {
91+
type = string
92+
description = "The Grafana admin password"
93+
default = "admin"
94+
sensitive = true
8895
}

grafana_v2/kickstart_v2/terraform/gcp/main.tf

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -205,8 +205,8 @@ resource "null_resource" "run-kickstart" {
205205
provisioner "remote-exec" {
206206
inline = [
207207
"cd redis-enterprise-observability/grafana_v2/kickstart_v2",
208-
"git checkout main",
209-
"./setup.sh ${local.redis_db_primary_fqdn} ../dashboards/grafana_v9-11/cloud/basic",
208+
"git checkout ${var.git_branch}",
209+
# single-quote the password (escaping embedded quotes) so the remote shell passes it to setup.sh as one literal argument
"./setup.sh ${local.redis_db_primary_fqdn} ../dashboards/grafana_v9-11/cloud/basic '${replace(var.grafana_password, "'", "'\\''")}'",
210210
]
211211
}
212212
}

grafana_v2/kickstart_v2/terraform/gcp/variables.tf

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,4 +85,17 @@ variable "existing_vpc_name" {
8585
type = string
8686
default = null
8787
description = "Name of existing VPC to use for peering. Required when existing_vpc_id is provided."
88+
}
89+
90+
variable "git_branch" {
91+
type = string
92+
description = "The git branch to use for the Grafana dashboards and Prometheus rules"
93+
default = "main"
94+
}
95+
96+
variable "grafana_password" {
97+
type = string
98+
description = "The Grafana admin password"
99+
default = "admin"
100+
sensitive = true
88101
}

prometheus_v2/rules/alerts.yml

Lines changed: 22 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -6,31 +6,26 @@ groups:
66
- name: latency
77
rules:
88
- alert: Average Latency Warning
9-
expr: '(irate(endpoint_acc_latency{job="redis", cluster="localhost", db="1"}[1m])) / (irate(endpoint_total_started_res{job="redis", cluster="localhost", db="1"}[1m]))'
9+
# Tune this threshold to your use case and expected latency; this example alerts when the average latency exceeds 2 ms.
10+
expr: '(sum by (db)(irate(endpoint_read_requests_latency_histogram_sum[1m]) + irate(endpoint_write_requests_latency_histogram_sum[1m]) + irate(endpoint_other_requests_latency_histogram_sum[1m])))/(sum by (db)(irate(endpoint_read_requests[1m]) + irate(endpoint_write_requests[1m]) + irate(endpoint_other_requests[1m])))/1000 > 2'
1011
for: 30s
1112
labels:
12-
severity: notification
13-
type: latency
14-
job: redis
15-
cluster: localhost
16-
db: 1
13+
severity: notification
14+
type: latency
1715
annotations:
18-
summary: Average Latency Warning
19-
description: "High Latency - Cluster: {{$labels.cluster}} DB: {{$labels.db}} Latency: {{$value}} ms"
20-
runbook: https://redis.io/docs/latest/integrate/prometheus-with-redis-enterprise/observability/#read-latency
16+
summary: Average Latency Warning
17+
description: "High Latency - DB: {{$labels.db}} Latency: {{$value}} ms"
18+
runbook: https://redis.io/docs/latest/integrate/prometheus-with-redis-enterprise/observability/#read-latency
2119
- alert: Average Latency Critical
22-
expr: '(irate(endpoint_acc_latency{job="redis", cluster="localhost", db="1"}[1m])) / (irate(endpoint_total_started_res{job="redis", cluster="localhost", db="1"}[1m]))'
20+
expr: '(sum by (db)(irate(endpoint_read_requests_latency_histogram_sum[1m]) + irate(endpoint_write_requests_latency_histogram_sum[1m]) + irate(endpoint_other_requests_latency_histogram_sum[1m])))/(sum by (db)(irate(endpoint_read_requests[1m]) + irate(endpoint_write_requests[1m]) + irate(endpoint_other_requests[1m])))/1000 > 5'
2321
for: 30s
2422
labels:
25-
severity: critical
26-
type: latency
27-
job: redis
28-
cluster: localhost
29-
db: 1
23+
severity: critical  # this is the > 5 ms "Average Latency Critical" rule, not the warning tier
24+
type: latency
3025
annotations:
31-
summary: Average Latency Critical
32-
description: "High Latency - Cluster: {{$labels.cluster}} DB: {{$labels.db}} Latency: {{$value}} ms"
33-
runbook: https://redis.io/docs/latest/integrate/prometheus-with-redis-enterprise/observability/#read-latency
26+
summary: Average Latency Critical
27+
description: "High Latency - DB: {{$labels.db}} Latency: {{$value}} ms"
28+
runbook: https://redis.io/docs/latest/integrate/prometheus-with-redis-enterprise/observability/#read-latency
3429

3530
- name: connections
3631
rules:
@@ -46,7 +41,7 @@ groups:
4641
description: "No Connections - Cluster: {{$labels.cluster}} DB: {{$labels.db}} Connections: {{$value}}"
4742
runbook: https://redis.io/docs/latest/integrate/prometheus-with-redis-enterprise/observability/#connections
4843
- alert: Excessive Connections
49-
expr: endpoint_client_connections > 64000
44+
expr: endpoint_client_connections - endpoint_client_disconnections > 64000
5045
for: 30s
5146
labels:
5247
severity: critical
@@ -60,7 +55,7 @@ groups:
6055
- name: throughput
6156
rules:
6257
- alert: No Redis Requests
63-
expr: endpoint_read_requests < 1 and endpoint_write_requests < 1
58+
expr: increase(endpoint_read_requests[1m]) < 1 and increase(endpoint_write_requests[1m]) < 1
6459
for: 30s
6560
labels:
6661
severity: critical
@@ -70,7 +65,7 @@ groups:
7065
description: "Too few Redis operations - Cluster: {{$labels.cluster}} DB: {{$labels.db}} {{$value}} (ops/sec)"
7166
runbook: https://redis.io/docs/latest/integrate/prometheus-with-redis-enterprise/observability/#connections
7267
- alert: Excessive Redis Requests
73-
expr: (endpoint_read_requests + endpoint_write_requests)/2 > 1000000
68+
expr: (irate(endpoint_read_requests[1m]) + irate(endpoint_write_requests[1m]))/2 > 100000 # The actual threshold will be based on your Redis Database, adjust as needed
7469
for: 30s
7570
labels:
7671
severity: critical
@@ -83,7 +78,7 @@ groups:
8378
- name: capacity
8479
rules:
8580
- alert: DB full
86-
expr: round((redis_server_used_memory{job="redis", cluster="localhost"}/redis_server_memory_limit{job="redis", cluster="localhost"}) * 100) > 95
81+
expr: ((redis_server_used_memory/redis_server_maxmemory) * 100 ) > 95
8782
for: 30s
8883
labels:
8984
severity: critical
@@ -93,7 +88,7 @@ groups:
9388
description: "DB Usage - Cluster: {{$labels.cluster}} DB: {{$labels.db}} Usage: {{$value}}% full"
9489
runbook: https://redis.io/docs/latest/integrate/prometheus-with-redis-enterprise/observability/#memory
9590
- alert: DB full in 2 hours
96-
expr: round((redis_server_used_memory{job="redis", cluster="localhost"}/redis_server_memory_limit{job="redis", cluster="localhost"}) * 100) < 95 and (predict_linear(redis_server_used_memory{job="redis", cluster="localhost"}[15m], 2 * 3600) / redis_server_memory_limit{job="redis", cluster="localhost"}) > 0.3 and round(predict_linear(redis_server_used_memory{job="redis", cluster="localhost"}[15m], 2 * 3600)/redis_server_memory_limit{job="redis", cluster="localhost"}) > 0.95
91+
# The predicted usage ratio is compared unrounded: round() would turn any prediction >= 0.5 into 1
# and trip the 0.95 threshold. The former "> 0.3" clause is implied by "> 0.95" and is dropped.
expr: round((redis_server_used_memory/redis_server_maxmemory) * 100) < 95 and (predict_linear(redis_server_used_memory[15m], 2 * 3600) / redis_server_maxmemory) > 0.95
9792
for: 30s
9893
labels:
9994
severity: notification
@@ -106,7 +101,7 @@ groups:
106101
- name: utilization
107102
rules:
108103
- alert: Low Hit Ratio
109-
expr: (redis_server_keyspace_read_hits{role="master"}/(redis_server_keyspace_read_hits{role="master"} + redis_server_keyspace_read_misses{role="master"})) < 1
104+
expr: (irate(redis_server_keyspace_read_hits{role="master"}[1m])/irate(redis_server_keyspace_read_misses{role="master"}[1m])) < 1
110105
for: 30s
111106
labels:
112107
severity: notification
@@ -173,7 +168,7 @@ groups:
173168
- name: nodes
174169
rules:
175170
- alert: Node Not Responding
176-
expr: count(node_metrics_up) != 3
171+
expr: count(node_metrics_up) != 3 # will depend on the number of nodes in your Redis cluster
177172
for: 5m
178173
labels:
179174
severity: critical
@@ -193,7 +188,7 @@ groups:
193188
summary: Node Persistent Storage
194189
description: "Low on Persistent Storage - Cluster: {{$labels.cluster}} Space Free: {{$value}}%"
195190
- alert: Node Ephemeral Storage
196-
expr: round((node_ephemeral_storage_free_bytes/node_ephemeral_storage_avail_bytes) * 100) <= 5
191+
expr: node_ephemeral_storage_free_bytes < 5000000000 # Depends on your environment how much ephemeral storage you want to trigger on (5GB in this example)
197192
for: 2m
198193
labels:
199194
severity: critical
@@ -202,7 +197,7 @@ groups:
202197
summary: Node Ephemeral Storage
203198
description: "Low on Ephemeral Storage - Cluster: {{$labels.cluster}} Space Free: {{ $value | humanize1024 }}B"
204199
- alert: Node Free Memory
205-
expr: round((node_available_memory_bytes/node_memory_MemFree_bytes) * 100) <= 15
200+
expr: node_available_memory_bytes < 5000000000 # Depends on your environment how much memory you want to trigger on (5GB in this example)
206201
for: 2m
207202
labels:
208203
severity: critical

prometheus_v2/rules/capacity-alerts.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ groups:
22
- name: capacity
33
rules:
44
- alert: DB full
5-
expr: round((redis_server_used_memory{job="redis", cluster="localhost"}/redis_server_memory_limit{job="redis", cluster="localhost"}) * 100) > 95
5+
# keep in sync with the "DB full in 2 hours" rule in this group, which only covers usage < 95
expr: ((redis_server_used_memory/redis_server_maxmemory) * 100 ) > 95
66
for: 30s
77
labels:
88
severity: critical
@@ -12,7 +12,7 @@ groups:
1212
description: "DB Usage - Cluster: {{$labels.cluster}} DB: {{$labels.db}} Usage: {{$value}}% full"
1313
runbook: https://redis.io/docs/latest/integrate/prometheus-with-redis-enterprise/observability/#memory
1414
- alert: DB full in 2 hours
15-
expr: round((redis_server_used_memory{job="redis", cluster="localhost"}/redis_server_memory_limit{job="redis", cluster="localhost"}) * 100) < 95 and (predict_linear(redis_server_used_memory{job="redis", cluster="localhost"}[15m], 2 * 3600) / redis_server_memory_limit{job="redis", cluster="localhost"}) > 0.3 and round(predict_linear(redis_server_used_memory{job="redis", cluster="localhost"}[15m], 2 * 3600)/redis_server_memory_limit{job="redis", cluster="localhost"}) > 0.95
15+
# The predicted usage ratio is compared unrounded: round() would turn any prediction >= 0.5 into 1
# and trip the 0.95 threshold. The former "> 0.3" clause is implied by "> 0.95" and is dropped.
expr: round((redis_server_used_memory/redis_server_maxmemory) * 100) < 95 and (predict_linear(redis_server_used_memory[15m], 2 * 3600) / redis_server_maxmemory) > 0.95
1616
for: 30s
1717
labels:
1818
severity: notification

prometheus_v2/rules/connection-alerts.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ groups:
1313
description: "No Connections - Cluster: {{$labels.cluster}} DB: {{$labels.db}} Connections: {{$value}}"
1414
runbook: https://redis.io/docs/latest/integrate/prometheus-with-redis-enterprise/observability/#connections
1515
- alert: Excessive Connections
16-
expr: endpoint_client_connections > 64000
16+
expr: endpoint_client_connections - endpoint_client_disconnections > 64000
1717
for: 30s
1818
labels:
1919
severity: critical
Lines changed: 23 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1,29 +1,24 @@
11
groups:
2-
- name: latency
3-
rules:
4-
- alert: Average Latency Warning
5-
expr: '(irate(endpoint_acc_latency{job="redis", cluster="localhost", db="1"}[1m])) / (irate(endpoint_total_started_res{job="redis", cluster="localhost", db="1"}[1m]))'
6-
for: 30s
7-
labels:
8-
severity: notification
9-
type: latency
10-
job: redis
11-
cluster: localhost
12-
db: 1
13-
annotations:
14-
summary: Average Latency Warning
15-
description: "High Latency - Cluster: {{$labels.cluster}} DB: {{$labels.db}} Latency: {{$value}} ms"
16-
runbook: https://redis.io/docs/latest/integrate/prometheus-with-redis-enterprise/observability/#read-latency
17-
- alert: Average Latency Critical
18-
expr: '(irate(endpoint_acc_latency{job="redis", cluster="localhost", db="1"}[1m])) / (irate(endpoint_total_started_res{job="redis", cluster="localhost", db="1"}[1m]))'
19-
for: 30s
20-
labels:
21-
severity: critical
22-
type: latency
23-
job: redis
24-
cluster: localhost
25-
db: 1
26-
annotations:
27-
summary: Average Latency Critical
28-
description: "High Latency - Cluster: {{$labels.cluster}} DB: {{$labels.db}} Latency: {{$value}} ms"
29-
runbook: https://redis.io/docs/latest/integrate/prometheus-with-redis-enterprise/observability/#read-latency
2+
- name: latency
3+
rules:
4+
- alert: Average Latency Warning
5+
# Tune this threshold to your use case and expected latency; this example alerts when the average latency exceeds 2 ms.
6+
expr: '(sum by (db)(irate(endpoint_read_requests_latency_histogram_sum[1m]) + irate(endpoint_write_requests_latency_histogram_sum[1m]) + irate(endpoint_other_requests_latency_histogram_sum[1m])))/(sum by (db)(irate(endpoint_read_requests[1m]) + irate(endpoint_write_requests[1m]) + irate(endpoint_other_requests[1m])))/1000 > 2'
7+
for: 30s
8+
labels:
9+
severity: notification
10+
type: latency
11+
annotations:
12+
summary: Average Latency Warning
13+
description: "High Latency - DB: {{$labels.db}} Latency: {{$value}} ms"
14+
runbook: https://redis.io/docs/latest/integrate/prometheus-with-redis-enterprise/observability/#read-latency
15+
- alert: Average Latency Critical
16+
expr: '(sum by (db)(irate(endpoint_read_requests_latency_histogram_sum[1m]) + irate(endpoint_write_requests_latency_histogram_sum[1m]) + irate(endpoint_other_requests_latency_histogram_sum[1m])))/(sum by (db)(irate(endpoint_read_requests[1m]) + irate(endpoint_write_requests[1m]) + irate(endpoint_other_requests[1m])))/1000 > 5'
17+
for: 30s
18+
labels:
19+
severity: critical  # this is the > 5 ms "Average Latency Critical" rule, not the warning tier
20+
type: latency
21+
annotations:
22+
summary: Average Latency Critical
23+
description: "High Latency - DB: {{$labels.db}} Latency: {{$value}} ms"
24+
runbook: https://redis.io/docs/latest/integrate/prometheus-with-redis-enterprise/observability/#read-latency

0 commit comments

Comments
 (0)