Skip to content

Commit 526425c

Browse files
authored
Merge pull request #29 from redis-field-engineering/prometheus_v2_realign
re-organize alert files and point tests at the appropriate file
2 parents 6888de7 + e7b39c0 commit 526425c

31 files changed

+295
-23
lines changed
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
groups:
2+
- name: capacity
3+
rules:
4+
- alert: DB full
5+
expr: round((redis_server_used_memory{job="redis", cluster="localhost"}/redis_server_memory_limit{job="redis", cluster="localhost"}) * 100) > 95
6+
for: 30s
7+
labels:
8+
severity: critical
9+
type: capacity
10+
annotations:
11+
summary: DB is full
12+
description: "DB Usage - Cluster: {{$labels.cluster}} DB: {{$labels.db}} Usage: {{$value}}% full"
13+
runbook: https://redis.io/docs/latest/integrate/prometheus-with-redis-enterprise/observability/#memory
14+
- alert: DB full in 2 hours
15+
# Fires while current usage is still below 95% but the 15m linear trend puts usage above 95% of the memory limit two hours from now.
# The forecast percentage is the left operand of `and`, so {{$value}} is the predicted usage that the description reports.
# Both sides are scaled to percent before round(); rounding the raw 0..1 ratio would already trip at 50%.
expr: round((predict_linear(redis_server_used_memory{job="redis", cluster="localhost"}[15m], 2 * 3600) / redis_server_memory_limit{job="redis", cluster="localhost"}) * 100) > 95 and round((redis_server_used_memory{job="redis", cluster="localhost"}/redis_server_memory_limit{job="redis", cluster="localhost"}) * 100) < 95
16+
for: 30s
17+
labels:
18+
severity: notification
19+
type: capacity
20+
annotations:
21+
summary: DB will be full in two hours
22+
description: "DB Usage - Cluster: {{$labels.cluster}} DB: {{$labels.db}} Usage: {{$value}}% in 2 hours"
23+
runbook: https://redis.io/docs/latest/integrate/prometheus-with-redis-enterprise/observability/#memory
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
groups:
2+
- name: connections
3+
rules:
4+
- alert: No Redis Connections
5+
expr: endpoint_client_connections < 1
6+
for: 30s
7+
labels:
8+
severity: notification
9+
job: redis
10+
type: connection
11+
annotations:
12+
summary: "No Redis Connections"
13+
description: "No Connections - Cluster: {{$labels.cluster}} DB: {{$labels.db}} Connections: {{$value}}"
14+
runbook: https://redis.io/docs/latest/integrate/prometheus-with-redis-enterprise/observability/#connections
15+
- alert: Excessive Connections
16+
expr: endpoint_client_connections > 64000
17+
for: 30s
18+
labels:
19+
severity: critical
20+
job: redis
21+
type: connection
22+
annotations:
23+
summary: "Too many connections"
24+
description: "Too Many Connections - Cluster: {{$labels.cluster}} DB: {{$labels.db}} Connections: {{$value}}"
25+
runbook: https://redis.io/docs/latest/integrate/prometheus-with-redis-enterprise/observability/#connections
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
groups:
2+
- name: latency
3+
rules:
4+
- alert: Average Latency Warning
5+
# Average latency per request: per-second increase of accumulated latency divided by per-second increase of started requests.
# NOTE(review): there is no comparison operator, so every series this returns becomes an alert regardless of its value,
# and the expression is identical to the "Average Latency Critical" rule — confirm the intended warning threshold.
expr: '(irate(endpoint_acc_latency{job="redis", cluster="localhost", db="1"}[1m])) / (irate(endpoint_total_started_res{job="redis", cluster="localhost", db="1"}[1m]))'
6+
for: 30s
7+
labels:
8+
severity: notification
9+
type: latency
10+
job: redis
11+
cluster: localhost
12+
# Label values are strings; quote the ID so YAML does not type it as an integer.
db: "1"
13+
annotations:
14+
summary: Average Latency Warning
15+
description: "High Latency - Cluster: {{$labels.cluster}} DB: {{$labels.db}} Latency: {{$value}} ms"
16+
runbook: https://redis.io/docs/latest/integrate/prometheus-with-redis-enterprise/observability/#read-latency
17+
- alert: Average Latency Critical
18+
# Average latency per request: per-second increase of accumulated latency divided by per-second increase of started requests.
# NOTE(review): there is no comparison operator, so every series this returns becomes an alert regardless of its value,
# and the expression is identical to the "Average Latency Warning" rule — confirm the intended critical threshold.
expr: '(irate(endpoint_acc_latency{job="redis", cluster="localhost", db="1"}[1m])) / (irate(endpoint_total_started_res{job="redis", cluster="localhost", db="1"}[1m]))'
19+
for: 30s
20+
labels:
21+
severity: critical
22+
type: latency
23+
job: redis
24+
cluster: localhost
25+
# Label values are strings; quote the ID so YAML does not type it as an integer.
db: "1"
26+
annotations:
27+
summary: Average Latency Critical
28+
description: "High Latency - Cluster: {{$labels.cluster}} DB: {{$labels.db}} Latency: {{$value}} ms"
29+
runbook: https://redis.io/docs/latest/integrate/prometheus-with-redis-enterprise/observability/#read-latency
Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
groups:
2+
- name: nodes
3+
rules:
4+
- alert: Node Not Responding
5+
# Count reporting nodes per cluster. Grouping by `cluster` keeps that label on the alert so the
# description's {{$labels.cluster}} renders; a bare count() aggregates every label away.
expr: count by (cluster) (node_metrics_up) != 3
6+
for: 5m
7+
labels:
8+
severity: critical
9+
job: redis
10+
type: node
11+
annotations:
12+
summary: Node Not Responding
13+
description: "Node Down - Cluster: {{$labels.cluster}} Expected Nodes: 3 Actual: {{$value}}"
14+
- alert: Node Persistent Storage
15+
expr: round((node_persistent_storage_free_bytes/node_persistent_storage_avail_bytes) * 100) <= 5
16+
for: 2m
17+
labels:
18+
severity: critical
19+
job: redis
20+
type: node
21+
annotations:
22+
summary: Node Persistent Storage
23+
description: "Low on Persistent Storage - Cluster: {{$labels.cluster}} Space Free: {{$value}}%"
24+
- alert: Node Ephemeral Storage
25+
expr: round((node_ephemeral_storage_free_bytes/node_ephemeral_storage_avail_bytes) * 100) <= 5
26+
for: 2m
27+
labels:
28+
severity: critical
29+
type: node
30+
annotations:
31+
summary: Node Ephemeral Storage
32+
description: "Low on Ephemeral Storage - Cluster: {{$labels.cluster}} Space Free: {{$value}}%"
33+
- alert: Node Free Memory
34+
# Percentage of node_available_memory_bytes relative to node_memory_MemFree_bytes, alerting at 15 or below.
# NOTE(review): the denominator is *free* memory, not total; if available memory is normally >= free memory
# this ratio stays at or above 100 and the rule cannot fire — confirm the intended denominator.
expr: round((node_available_memory_bytes/node_memory_MemFree_bytes) * 100) <= 15
35+
for: 2m
36+
labels:
37+
severity: critical
38+
type: node
39+
annotations:
40+
summary: Node Free Memory
41+
description: "Low on Memory - Cluster: {{$labels.cluster}} Memory Free: {{$value}}%"
42+
runbook: https://redis.io/docs/latest/integrate/prometheus-with-redis-enterprise/observability/#memory
43+
- alert: Node CPU Usage
44+
# Per-second CPU time consumed over the last 5 minutes by processes on node 1.
# NOTE(review): `>= 0` is satisfied by every series rate() returns, so this critical alert is active
# whenever the metric exists — looks like a placeholder; confirm the real CPU threshold.
expr: rate(namedprocess_namegroup_cpu_seconds_total{job="redis", cluster="localhost", node="1"}[5m]) >= 0
45+
for: 5m
46+
labels:
47+
severity: critical
48+
type: node
49+
annotations:
50+
summary: Node CPU Usage
51+
description: "High CPU Usage - Cluster: {{$labels.cluster}} Usage: {{$value}}%"
52+
runbook: https://redis.io/docs/latest/integrate/prometheus-with-redis-enterprise/observability/#cpu
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
groups:
2+
- name: shards
3+
rules:
4+
- alert: Shard Down
5+
expr: redis_server_up == 0
6+
for: 1m
7+
labels:
8+
severity: page
9+
type: shard
10+
annotations:
11+
summary: Redis Shard instance is down
12+
description: Redis Shard instance {{$labels.cluster}} is down
13+
- alert: Master Shard Down
14+
expr: floor(redis_server_master_link_status{role="slave"}) < 1
15+
for: 1m
16+
labels:
17+
severity: notification
18+
type: shard
19+
annotations:
20+
summary: Master Shard Down
21+
description: "Slave has no master - Cluster: {{$labels.cluster}} Shard: {{$labels.redis}} Node: {{$labels.node}}"
22+
runbook: https://redis.io/docs/latest/integrate/prometheus-with-redis-enterprise/observability/#cpu
23+
- alert: Shard CPU Usage
24+
# Per-second CPU time consumed over the last 5 minutes by shard 1 on node 1.
# NOTE(review): `>= 0` is satisfied by every series rate() returns, so this alert is active
# whenever the metric exists — looks like a placeholder; confirm the real CPU threshold.
expr: rate(namedprocess_namegroup_cpu_seconds_total{job="redis", cluster="localhost", node="1", redis="1"}[5m]) >= 0
25+
for: 5m
26+
labels:
27+
severity: notification
28+
type: shard
29+
annotations:
30+
summary: Shard CPU Usage
31+
description: "Busy Shard - Cluster: {{$labels.cluster}} Shard: {{$labels.redis}} Node: {{$labels.node}} CPU: {{$value}}%"
32+
runbook: https://redis.io/docs/latest/integrate/prometheus-with-redis-enterprise/observability/#cpu
33+
- alert: Hot Master Shard
34+
# Per-second CPU time consumed over the last 5 minutes by master shard 1 on node 1.
# NOTE(review): `>= 0` is satisfied by every series rate() returns, so this critical alert is active
# whenever the metric exists — looks like a placeholder; confirm the real "hot shard" threshold.
expr: rate(namedprocess_namegroup_cpu_seconds_total{job="redis", cluster="localhost", role="master", node="1", redis="1"}[5m]) >= 0
35+
for: 1m
36+
labels:
37+
severity: critical
38+
type: shard
39+
role: master
40+
annotations:
41+
summary: Hot Master Shard
42+
description: "Hot Shard - Cluster: {{$labels.cluster}} Shard: {{$labels.redis}} Node: {{$labels.node}} CPU: {{$value}}%"
43+
runbook: https://redis.io/docs/latest/integrate/prometheus-with-redis-enterprise/observability/#cpu
44+
- alert: Proxy CPU Usage
45+
expr: rate(namedprocess_namegroup_cpu_seconds_total{job="redis", cluster="localhost", groupname="dmcproxy"}[5m]) > 0.0
46+
for: 5m
47+
labels:
48+
severity: critical
49+
type: proxy
50+
annotations:
51+
summary: Proxy CPU Usage
52+
# The stated duration mirrors this rule's `for: 5m` pending period.
description: "Proxy CPU usage has exceeded {{$value}}% for more than 5 minutes"
53+
runbook: https://redis.io/docs/latest/integrate/prometheus-with-redis-enterprise/observability/#cpu
Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
groups:
2+
- name: synchronization
3+
rules:
4+
- alert: ReplicaSync Status
5+
expr: database_syncer_current_status{syncer_type="replicaof"} > 0
6+
for: 5m
7+
labels:
8+
severity: warning
9+
type: synchronization
10+
annotations:
11+
summary: Replication - Resynchronization Requests (Status)
12+
# The stated duration mirrors this rule's `for: 5m` pending period.
description: Replication on {{$labels.cluster}} not synchronized for 5 minutes
13+
runbook: https://redis.io/docs/latest/integrate/prometheus-with-redis-enterprise/observability/#synchronization
14+
- alert: CRDTSync Status
15+
expr: database_syncer_current_status{syncer_type="crdt"} > 0
16+
for: 5m
17+
labels:
18+
severity: warning
19+
type: synchronization
20+
annotations:
21+
summary: Replication - Unsynchronized (CRDT)
22+
description: CRDT Replication on {{$labels.cluster}} not synchronized for 5 minutes
23+
runbook: https://redis.io/docs/latest/integrate/prometheus-with-redis-enterprise/observability/#synchronization
24+
- alert: ReplicaLag Status
25+
expr: database_syncer_lag_ms{syncer_type="replicaof"} > 500
26+
for: 5m
27+
labels:
28+
severity: warning
29+
type: synchronization
30+
annotations:
31+
summary: Replication - High Latency (Status)
32+
# The stated duration mirrors this rule's `for: 5m` pending period.
description: Replication Latency on {{$labels.cluster}} exceeded 500ms for 5 minutes
33+
runbook: https://redis.io/docs/latest/integrate/prometheus-with-redis-enterprise/observability/#synchronization
34+
- alert: CRDTLag Status
35+
expr: database_syncer_lag_ms{syncer_type="crdt", job="redis", cluster="localhost"} > 500
36+
for: 5m
37+
labels:
38+
severity: warning
39+
type: synchronization
40+
annotations:
41+
summary: Replication - High Latency (CRDT)
42+
# The stated duration mirrors this rule's `for: 5m` pending period.
description: CRDT Replication Latency on {{$labels.cluster}} exceeded 500ms for 5 minutes
43+
runbook: https://redis.io/docs/latest/integrate/prometheus-with-redis-enterprise/observability/#synchronization
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
groups:
2+
- name: throughput
3+
rules:
4+
- alert: No Redis Requests
5+
expr: endpoint_read_requests < 1 and endpoint_write_requests < 1
6+
for: 30s
7+
labels:
8+
severity: critical
9+
type: throughput
10+
annotations:
11+
summary: No Redis Requests
12+
description: "Too few Redis operations - Cluster: {{$labels.cluster}} DB: {{$labels.db}} {{$value}} (ops/sec)"
13+
runbook: https://redis.io/docs/latest/integrate/prometheus-with-redis-enterprise/observability/#connections
14+
- alert: Excessive Redis Requests
15+
expr: (endpoint_read_requests + endpoint_write_requests)/2 > 1000000
16+
for: 30s
17+
labels:
18+
severity: critical
19+
type: throughput
20+
annotations:
21+
summary: Excessive Redis Requests
22+
description: "Too Many Redis Operations - Cluster: {{$labels.cluster}} DB: {{$labels.db}} {{$value}} (ops/sec)"
23+
runbook: https://redis.io/docs/latest/integrate/prometheus-with-redis-enterprise/observability/#connections
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
groups:
2+
- name: utilization
3+
rules:
4+
- alert: Low Hit Ratio
5+
expr: (redis_server_keyspace_read_hits{role="master"}/(redis_server_keyspace_read_hits{role="master"} + redis_server_keyspace_read_misses{role="master"})) < 1
6+
for: 30s
7+
labels:
8+
severity: notification
9+
type: utilization
10+
annotations:
11+
summary: Low Hit Ratio
12+
description: "Low Hit Ratio: {{$labels.cluster}} DB: {{$labels.db}} Hit Ratio: {{$value}}%"
13+
runbook: https://redis.io/docs/latest/integrate/prometheus-with-redis-enterprise/observability/#cache-hit-ratio-and-eviction
14+
- alert: Unexpected Object Eviction
15+
expr: redis_server_evicted_keys{role="master"} > 1
16+
for: 3m
17+
labels:
18+
job: redis
19+
severity: critical
20+
type: utilization
21+
annotations:
22+
summary: Unexpected Object Eviction
23+
description: "Evictions Occurring: {{$labels.cluster}} DB: {{$labels.db}} EvictionsPerSecond: {{$value}}"
24+
runbook: https://redis.io/docs/latest/integrate/prometheus-with-redis-enterprise/observability/#cache-hit-ratio-and-eviction

prometheus_v2/tests/crdt_lag.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
# Only this file is passed as command line argument.
33

44
rule_files:
5-
- ../rules/alerts.yml
5+
- ../rules/synchronization-alerts.yml
66

77
evaluation_interval: 1m
88

prometheus_v2/tests/crdt_sync.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
# Only this file is passed as command line argument.
33

44
rule_files:
5-
- ../rules/alerts.yml
5+
- ../rules/synchronization-alerts.yml
66

77
evaluation_interval: 1m
88

0 commit comments

Comments
 (0)