Merge pull request #72 from redis-field-engineering/prom/v2-alerts

slorello89 · web-flow · commit ebe981df6caa · 2025-09-11T11:35:44.000-04:00
FIELDENG-817 - updates to v2 alerts
diff --git a/grafana_v2/kickstart_v2/docker-compose.yml b/grafana_v2/kickstart_v2/docker-compose.yml
@@ -27,6 +27,8 @@ services:
       - prometheus
     extra_hosts:
       - "host.docker.internal:host-gateway"
+    environment:  # admin password comes from the caller's shell; falls back to Grafana's stock 'admin'
+      - GF_SECURITY_ADMIN_PASSWORD=${GF_SECURITY_ADMIN_PASSWORD:-admin}
     networks:
       kickstart:
         ipv4_address: "172.27.2.3"
diff --git a/grafana_v2/kickstart_v2/setup.sh b/grafana_v2/kickstart_v2/setup.sh
@@ -10,14 +10,18 @@ PROJECT="demo"
 
 # this script can take a second argument; the folder from which it should load dashboards
 # by default it will use ../dashboards/grafana_v9-11/software/basic
+# this script can take a third argument; the grafana password (defaults to 'admin')
+
+# Set default password
+password="admin"
 
 if [ $# -eq 0 ]; then
-  echo "using default endpoint and folder"
+  echo "using default endpoint, folder, and password"
   folder="../dashboards/grafana_v9-11/software/basic/*"
 fi
 
 if [ $# -eq 1 ]; then
-  echo "using endpoint $1 and default folder"
+  echo "using endpoint $1, default folder, and default password"
   folder="../dashboards/grafana_v9-11/software/basic/*"
     if [[ "$OSTYPE" == "linux-gnu"* ]]; then # Linux
       sed -i "s/host.docker.internal/$1/g" prometheus.yml
@@ -28,8 +32,24 @@ fi
 
 if [ $# -eq 2 ]; then
   if [ -d "$2" ]; then
-    echo "using endpoint $1 and folder $2"
+    echo "using endpoint $1, folder $2, and default password"
+    folder="$2/*"
+    if [[ "$OSTYPE" == "linux-gnu"* ]]; then # Linux
+      sed -i "s/host.docker.internal/$1/g" prometheus.yml
+    elif [[ "$OSTYPE" == "darwin"* ]]; then # Mac OSX
+      sed -i '' "s/host.docker.internal/$1/g" prometheus.yml
+    fi
+  else
+    echo "second argument must be a directory!"
+    exit 1
+  fi
+fi
+
+if [ $# -eq 3 ]; then
+  if [ -d "$2" ]; then
+    echo "using endpoint $1, folder $2, and password $3"
     folder="$2/*"
+    password="$3"
     if [[ "$OSTYPE" == "linux-gnu"* ]]; then # Linux
       sed -i "s/host.docker.internal/$1/g" prometheus.yml
     elif [[ "$OSTYPE" == "darwin"* ]]; then # Mac OSX
@@ -54,7 +74,7 @@ done
 # create prometheus datasource
 echo ""
 echo "create grafana datasource"
-curl -s 'http://admin:admin@localhost:3000/api/datasources' \
+curl -s "http://admin:$password@localhost:3000/api/datasources" \
 --header 'Accept: application/json' \
 --header 'Content-Type: application/json' \
 --data '{   "name": "prometheus-demo",
@@ -68,13 +88,13 @@ curl -s 'http://admin:admin@localhost:3000/api/datasources' \
 echo ""
 echo ""
 echo "perform datasource health check"
-data=`curl -s 'http://admin:admin@localhost:3000/api/datasources/name/prometheus-demo'`
+data=`curl -s "http://admin:$password@localhost:3000/api/datasources/name/prometheus-demo"`
 
 str=${data#*\"uid\"\:\"*}
 uid=${str%%\"*}
 
 # use the datasource's uid to check its health
-data=`curl -s 'http://admin:admin@localhost:3000/api/datasources/uid/'$uid'/health'`
+data=`curl -s "http://admin:$password@localhost:3000/api/datasources/uid/$uid/health"`
 
 str=${data#*\"status\"\:\"*}
 status=${str%%\"*}
@@ -95,7 +115,7 @@ for file in $folder; do
         echo "$file"
         d=`cat "$file"`
         echo "{  \"dashboard\": $d,\"folderId\": 0,  \"message\": \"Created by Redis demo setup script\",  \"overwrite\": false}" \
-        | sed s/\"uid\"\:\ \"\$\{DS_PROMETHEUS\}\"/\"name\"\:\ \"prometheus-demo\"/g | curl -s 'http://admin:admin@localhost:3000/api/dashboards/db' \
+        | sed s/\"uid\"\:\ \"\$\{DS_PROMETHEUS\}\"/\"name\"\:\ \"prometheus-demo\"/g | curl -s "http://admin:$password@localhost:3000/api/dashboards/db" \
                --header 'Accept: application/json' \
                --header 'Content-Type: application/json' \
                --data-binary @-
@@ -104,7 +124,7 @@ done
 
 echo ""
 echo "You can open a browser and access Grafana, and Prometheus at:"
-echo "  Grafana: http://localhost:3000 (username=admin and password=admin)"
+echo "  Grafana: http://localhost:3000 (username=admin and password=$password)"
 echo "  Prometheus: http://localhost:9090"
 echo ""
 echo ""
diff --git a/grafana_v2/kickstart_v2/terraform/aws/main.tf b/grafana_v2/kickstart_v2/terraform/aws/main.tf
@@ -265,7 +265,7 @@ resource "null_resource" "run-kickstart" {
         inline = [
             "cd redis-enterprise-observability/grafana_v2/kickstart_v2",
             "git checkout main",
-            "./setup.sh ${local.redis_db_primary_fqdn} ../dashboards/grafana_v9-11/cloud/basic",
+            "./setup.sh ${local.redis_db_primary_fqdn} ../dashboards/grafana_v9-11/cloud/basic ${var.grafana_password}",
         ]
     }
 }
diff --git a/grafana_v2/kickstart_v2/terraform/aws/variables.tf b/grafana_v2/kickstart_v2/terraform/aws/variables.tf
@@ -85,4 +85,11 @@ variable "existing_security_group_id" {
   type        = string
   default     = null
   description = "ID of existing security group to use. Required when existing_vpc_id is provided."
+}
+
+variable "grafana_password" {
+  type        = string
+  description = "The Grafana admin password"
+  default     = "admin"
+  sensitive   = true
 }
diff --git a/grafana_v2/kickstart_v2/terraform/gcp/main.tf b/grafana_v2/kickstart_v2/terraform/gcp/main.tf
@@ -205,8 +205,8 @@ resource "null_resource" "run-kickstart" {
     provisioner "remote-exec" {
             inline = [
                 "cd redis-enterprise-observability/grafana_v2/kickstart_v2",
-                "git checkout main",
-                "./setup.sh ${local.redis_db_primary_fqdn} ../dashboards/grafana_v9-11/cloud/basic",
+                "git checkout ${var.git_branch}",
+                "./setup.sh ${local.redis_db_primary_fqdn} ../dashboards/grafana_v9-11/cloud/basic ${var.grafana_password}",
             ]
         }
 }
diff --git a/grafana_v2/kickstart_v2/terraform/gcp/variables.tf b/grafana_v2/kickstart_v2/terraform/gcp/variables.tf
@@ -85,4 +85,17 @@ variable "existing_vpc_name" {
   type        = string
   default     = null
   description = "Name of existing VPC to use for peering. Required when existing_vpc_id is provided."
+}
+
+variable "git_branch" {
+    type      = string
+    description = "The git branch to use for the Grafana dashboards and Prometheus rules"
+    default   = "main"
+}
+
+variable "grafana_password" {
+    type        = string
+    description = "The Grafana admin password"
+    default     = "admin"
+    sensitive   = true
 }
diff --git a/prometheus_v2/rules/alerts.yml b/prometheus_v2/rules/alerts.yml
@@ -6,31 +6,26 @@ groups:
  - name: latency
    rules:
    - alert: Average Latency Warning
-     expr: '(irate(endpoint_acc_latency{job="redis", cluster="localhost", db="1"}[1m])) / (irate(endpoint_total_started_res{job="redis", cluster="localhost", db="1"}[1m]))'
+     # Tune the threshold to the latency you expect for your workload; this example alerts when average latency exceeds 2 ms.
+     expr: '(sum by (db)(irate(endpoint_read_requests_latency_histogram_sum[1m]) + irate(endpoint_write_requests_latency_histogram_sum[1m]) + irate(endpoint_other_requests_latency_histogram_sum[1m])))/(sum by (db)(irate(endpoint_read_requests[1m]) + irate(endpoint_write_requests[1m]) + irate(endpoint_other_requests[1m])))/1000 > 2' 
      for: 30s
      labels:
-       severity: notification
-       type: latency
-       job: redis
-       cluster: localhost
-       db: 1
+      severity: notification
+      type: latency
      annotations:
-       summary: Average Latency Warning
-       description: "High Latency - Cluster: {{$labels.cluster}} DB: {{$labels.db}} Latency: {{$value}} ms"
-       runbook: https://redis.io/docs/latest/integrate/prometheus-with-redis-enterprise/observability/#read-latency
+      summary: Average Latency Warning
+      description: "High Latency - DB: {{$labels.db}} Latency: {{$value}} ms"
+      runbook: https://redis.io/docs/latest/integrate/prometheus-with-redis-enterprise/observability/#read-latency
    - alert: Average Latency Critical
-     expr: '(irate(endpoint_acc_latency{job="redis", cluster="localhost", db="1"}[1m])) / (irate(endpoint_total_started_res{job="redis", cluster="localhost", db="1"}[1m]))'
+     expr: '(sum by (db)(irate(endpoint_read_requests_latency_histogram_sum[1m]) + irate(endpoint_write_requests_latency_histogram_sum[1m]) + irate(endpoint_other_requests_latency_histogram_sum[1m])))/(sum by (db)(irate(endpoint_read_requests[1m]) + irate(endpoint_write_requests[1m]) + irate(endpoint_other_requests[1m])))/1000 > 5'
      for: 30s
      labels:
-       severity: critical
-       type: latency
-       job: redis
-       cluster: localhost
-       db: 1
+      severity: critical  # 5 ms threshold; the Warning rule uses 2 ms with severity notification
+      type: latency
      annotations:
-       summary: Average Latency Critical
-       description: "High Latency - Cluster: {{$labels.cluster}} DB: {{$labels.db}} Latency: {{$value}} ms"
-       runbook: https://redis.io/docs/latest/integrate/prometheus-with-redis-enterprise/observability/#read-latency
+      summary: Average Latency Critical
+      description: "High Latency - DB: {{$labels.db}} Latency: {{$value}} ms"
+      runbook: https://redis.io/docs/latest/integrate/prometheus-with-redis-enterprise/observability/#read-latency
 
  - name: connections
    rules:
@@ -46,7 +41,7 @@ groups:
        description: "No Connections - Cluster: {{$labels.cluster}} DB: {{$labels.db}} Connections: {{$value}}"
        runbook: https://redis.io/docs/latest/integrate/prometheus-with-redis-enterprise/observability/#connections
    - alert: Excessive Connections
-     expr: endpoint_client_connections > 64000
+     expr: endpoint_client_connections - endpoint_client_disconnections > 64000
      for: 30s
      labels:
        severity: critical
@@ -60,7 +55,7 @@ groups:
  - name: throughput
    rules:
    - alert: No Redis Requests
-     expr: endpoint_read_requests < 1 and endpoint_write_requests < 1
+     expr: increase(endpoint_read_requests[1m]) < 1 and increase(endpoint_write_requests[1m]) < 1
      for: 30s
      labels:
        severity: critical
@@ -70,7 +65,7 @@ groups:
        description: "Too few Redis operations - Cluster: {{$labels.cluster}} DB: {{$labels.db}}  {{$value}} (ops/sec)"
        runbook: https://redis.io/docs/latest/integrate/prometheus-with-redis-enterprise/observability/#connections
    - alert: Excessive Redis Requests
-     expr: (endpoint_read_requests + endpoint_write_requests)/2 > 1000000
+     expr: (irate(endpoint_read_requests[1m]) + irate(endpoint_write_requests[1m]))/2 > 100000 # The actual threshold will be based on your Redis Database, adjust as needed 
      for: 30s
      labels:
        severity: critical
@@ -83,7 +78,7 @@ groups:
  - name: capacity
    rules:
    - alert: DB full
-     expr: round((redis_server_used_memory{job="redis", cluster="localhost"}/redis_server_memory_limit{job="redis", cluster="localhost"}) * 100) > 95
+     expr: ((redis_server_used_memory/redis_server_maxmemory) * 100 ) > 95
      for: 30s
      labels:
        severity: critical
@@ -93,7 +88,7 @@ groups:
        description: "DB Usage - Cluster: {{$labels.cluster}} DB: {{$labels.db}} Usage: {{$value}}% full"
        runbook: https://redis.io/docs/latest/integrate/prometheus-with-redis-enterprise/observability/#memory
    - alert: DB full in 2 hours
-     expr: round((redis_server_used_memory{job="redis", cluster="localhost"}/redis_server_memory_limit{job="redis", cluster="localhost"}) * 100) < 95 and (predict_linear(redis_server_used_memory{job="redis", cluster="localhost"}[15m], 2 * 3600) / redis_server_memory_limit{job="redis", cluster="localhost"}) > 0.3 and round(predict_linear(redis_server_used_memory{job="redis", cluster="localhost"}[15m], 2 * 3600)/redis_server_memory_limit{job="redis", cluster="localhost"}) > 0.95
+     expr: round((redis_server_used_memory/redis_server_maxmemory) * 100) < 95 and (predict_linear(redis_server_used_memory[15m], 2 * 3600) / redis_server_maxmemory) > 0.95 # not yet 95% full, but the 15m trend projects more than 95% of maxmemory within 2h (no round() on the ratio: it would turn 0.95 into an effective 0.5)
      for: 30s
      labels:
        severity: notification
@@ -106,7 +101,7 @@ groups:
  - name: utilization
    rules:
    - alert: Low Hit Ratio
-     expr: (redis_server_keyspace_read_hits{role="master"}/(redis_server_keyspace_read_hits{role="master"} + redis_server_keyspace_read_misses{role="master"})) < 1
+     expr: (irate(redis_server_keyspace_read_hits{role="master"}[1m])/irate(redis_server_keyspace_read_misses{role="master"}[1m])) < 1
      for: 30s
      labels:
        severity: notification
@@ -173,7 +168,7 @@ groups:
  - name: nodes
    rules:
    - alert: Node Not Responding
-     expr: count(node_metrics_up) != 3
+     expr: count(node_metrics_up) != 3 # will depend on the number of nodes in your Redis cluster
      for: 5m
      labels:
        severity: critical
@@ -193,7 +188,7 @@ groups:
        summary: Node Persistent Storage
        description: "Low on Persistent Storage - Cluster: {{$labels.cluster}} Space Free: {{$value}}%"
    - alert: Node Ephemeral Storage
-     expr: round((node_ephemeral_storage_free_bytes/node_ephemeral_storage_avail_bytes) * 100) <= 5
+     expr: node_ephemeral_storage_free_bytes < 5000000000 # Depends on your environment how much ephemeral storage you want to trigger on (5GB in this example)
      for: 2m
      labels:
        severity: critical
@@ -202,7 +197,7 @@ groups:
        summary: Node Ephemeral Storage
        description: "Low on Ephemeral Storage - Cluster: {{$labels.cluster}} Space Free: {{$value}}%"
    - alert: Node Free Memory
-     expr: round((node_available_memory_bytes/node_memory_MemFree_bytes) * 100) <= 15
+     expr: node_available_memory_bytes < 5000000000 # Depends on your environment how much memory you want to trigger on (5GB in this example)
      for: 2m
      labels:
        severity: critical
diff --git a/prometheus_v2/rules/capacity-alerts.yml b/prometheus_v2/rules/capacity-alerts.yml
@@ -2,7 +2,7 @@ groups:
   - name: capacity
     rules:
       - alert: DB full
-        expr: round((redis_server_used_memory{job="redis", cluster="localhost"}/redis_server_memory_limit{job="redis", cluster="localhost"}) * 100) > 95
+        expr: ((redis_server_used_memory/redis_server_maxmemory) * 100 ) > 90
         for: 30s
         labels:
           severity: critical
@@ -12,7 +12,7 @@ groups:
           description: "DB Usage - Cluster: {{$labels.cluster}} DB: {{$labels.db}} Usage: {{$value}}% full"
           runbook: https://redis.io/docs/latest/integrate/prometheus-with-redis-enterprise/observability/#memory
       - alert: DB full in 2 hours
-        expr: round((redis_server_used_memory{job="redis", cluster="localhost"}/redis_server_memory_limit{job="redis", cluster="localhost"}) * 100) < 95 and (predict_linear(redis_server_used_memory{job="redis", cluster="localhost"}[15m], 2 * 3600) / redis_server_memory_limit{job="redis", cluster="localhost"}) > 0.3 and round(predict_linear(redis_server_used_memory{job="redis", cluster="localhost"}[15m], 2 * 3600)/redis_server_memory_limit{job="redis", cluster="localhost"}) > 0.95
+        expr: round((redis_server_used_memory/redis_server_maxmemory) * 100) < 95 and (predict_linear(redis_server_used_memory[15m], 2 * 3600) / redis_server_maxmemory) > 0.95 # not yet 95% full, but the 15m trend projects more than 95% of maxmemory within 2h (no round() on the ratio: it would turn 0.95 into an effective 0.5)
         for: 30s
         labels:
           severity: notification
diff --git a/prometheus_v2/rules/connection-alerts.yml b/prometheus_v2/rules/connection-alerts.yml
@@ -13,7 +13,7 @@ groups:
           description: "No Connections - Cluster: {{$labels.cluster}} DB: {{$labels.db}} Connections: {{$value}}"
           runbook: https://redis.io/docs/latest/integrate/prometheus-with-redis-enterprise/observability/#connections
       - alert: Excessive Connections
-        expr: endpoint_client_connections > 64000
+        expr: endpoint_client_connections - endpoint_client_disconnections > 64000
         for: 30s
         labels:
           severity: critical
diff --git a/prometheus_v2/rules/latency-alerts.yml b/prometheus_v2/rules/latency-alerts.yml
@@ -1,29 +1,24 @@
 groups:
- - name: latency
-   rules:
-   - alert: Average Latency Warning
-     expr: '(irate(endpoint_acc_latency{job="redis", cluster="localhost", db="1"}[1m])) / (irate(endpoint_total_started_res{job="redis", cluster="localhost", db="1"}[1m]))'
-     for: 30s
-     labels:
-       severity: notification
-       type: latency
-       job: redis
-       cluster: localhost
-       db: 1
-     annotations:
-       summary: Average Latency Warning
-       description: "High Latency - Cluster: {{$labels.cluster}} DB: {{$labels.db}} Latency: {{$value}} ms"
-       runbook: https://redis.io/docs/latest/integrate/prometheus-with-redis-enterprise/observability/#read-latency
-   - alert: Average Latency Critical
-     expr: '(irate(endpoint_acc_latency{job="redis", cluster="localhost", db="1"}[1m])) / (irate(endpoint_total_started_res{job="redis", cluster="localhost", db="1"}[1m]))'
-     for: 30s
-     labels:
-       severity: critical
-       type: latency
-       job: redis
-       cluster: localhost
-       db: 1
-     annotations:
-       summary: Average Latency Critical
-       description: "High Latency - Cluster: {{$labels.cluster}} DB: {{$labels.db}} Latency: {{$value}} ms"
-       runbook: https://redis.io/docs/latest/integrate/prometheus-with-redis-enterprise/observability/#read-latency
+  - name: latency
+    rules:
+      - alert: Average Latency Warning
+        # Tune the threshold to the latency you expect for your workload; this example alerts when average latency exceeds 2 ms.
+        expr: '(sum by (db)(irate(endpoint_read_requests_latency_histogram_sum[1m]) + irate(endpoint_write_requests_latency_histogram_sum[1m]) + irate(endpoint_other_requests_latency_histogram_sum[1m])))/(sum by (db)(irate(endpoint_read_requests[1m]) + irate(endpoint_write_requests[1m]) + irate(endpoint_other_requests[1m])))/1000 > 2'
+        for: 30s
+        labels:
+          severity: notification
+          type: latency
+        annotations:
+          summary: Average Latency Warning
+          description: "High Latency - DB: {{$labels.db}} Latency: {{$value}} ms"
+          runbook: https://redis.io/docs/latest/integrate/prometheus-with-redis-enterprise/observability/#read-latency
+      - alert: Average Latency Critical
+        expr: '(sum by (db)(irate(endpoint_read_requests_latency_histogram_sum[1m]) + irate(endpoint_write_requests_latency_histogram_sum[1m]) + irate(endpoint_other_requests_latency_histogram_sum[1m])))/(sum by (db)(irate(endpoint_read_requests[1m]) + irate(endpoint_write_requests[1m]) + irate(endpoint_other_requests[1m])))/1000 > 5'
+        for: 30s
+        labels:
+          severity: critical  # 5 ms threshold; the Warning rule uses 2 ms with severity notification
+          type: latency
+        annotations:
+          summary: Average Latency Critical
+          description: "High Latency - DB: {{$labels.db}} Latency: {{$value}} ms"
+          runbook: https://redis.io/docs/latest/integrate/prometheus-with-redis-enterprise/observability/#read-latency
diff --git a/prometheus_v2/rules/throughput-alerts.yml b/prometheus_v2/rules/throughput-alerts.yml
@@ -2,7 +2,7 @@ groups:
   - name: throughput
     rules:
       - alert: No Redis Requests
-        expr: endpoint_read_requests < 1 and endpoint_write_requests < 1
+        expr: increase(endpoint_read_requests[1m]) < 1 and increase(endpoint_write_requests[1m]) < 1
         for: 30s
         labels:
           severity: critical
@@ -12,7 +12,7 @@ groups:
           description: "Too few Redis operations - Cluster: {{$labels.cluster}} DB: {{$labels.db}}  {{$value}} (ops/sec)"
           runbook: https://redis.io/docs/latest/integrate/prometheus-with-redis-enterprise/observability/#connections
       - alert: Excessive Redis Requests
-        expr: (endpoint_read_requests + endpoint_write_requests)/2 > 1000000
+        expr: (irate(endpoint_read_requests[1m]) + irate(endpoint_write_requests[1m]))/2 > 100000 # The actual threshold will be based on your Redis Database, adjust as needed 
         for: 30s
         labels:
           severity: critical
diff --git a/prometheus_v2/rules/utilization-alerts.yml b/prometheus_v2/rules/utilization-alerts.yml
@@ -2,7 +2,7 @@ groups:
   - name: utilization
     rules:
       - alert: Low Hit Ratio
-        expr: (redis_server_keyspace_read_hits{role="master"}/(redis_server_keyspace_read_hits{role="master"} + redis_server_keyspace_read_misses{role="master"})) < 1
+        expr: (irate(redis_server_keyspace_read_hits{role="master"}[1m])/irate(redis_server_keyspace_read_misses{role="master"}[1m])) < 1
         for: 30s
         labels:
           severity: notification

Original file line number	Diff line number	Diff line change
`@@ -265,7 +265,7 @@ resource "null_resource" "run-kickstart" {`
`265`	`265`	`inline = [`
`266`	`266`	`"cd redis-enterprise-observability/grafana_v2/kickstart_v2",`
`267`	`267`	`"git checkout main",`
`268`		`- "./setup.sh ${local.redis_db_primary_fqdn} ../dashboards/grafana_v9-11/cloud/basic",`
	`268`	`+ "./setup.sh ${local.redis_db_primary_fqdn} ../dashboards/grafana_v9-11/cloud/basic ${var.grafana_password}",`
`269`	`269`	`]`
`270`	`270`	`}`
`271`	`271`	`}`
Original file line number	Diff line number	Diff line change
`@@ -205,8 +205,8 @@ resource "null_resource" "run-kickstart" {`
`205`	`205`	`provisioner "remote-exec" {`
`206`	`206`	`inline = [`
`207`	`207`	`"cd redis-enterprise-observability/grafana_v2/kickstart_v2",`
`208`		`- "git checkout main",`
`209`		`- "./setup.sh ${local.redis_db_primary_fqdn} ../dashboards/grafana_v9-11/cloud/basic",`
	`208`	`+ "git checkout ${var.git_branch}",`
	`209`	`+ "./setup.sh ${local.redis_db_primary_fqdn} ../dashboards/grafana_v9-11/cloud/basic ${var.grafana_password}",`
`210`	`210`	`]`
`211`	`211`	`}`
`212`	`212`	`}`