# Kafka alert definitions: a `kind: Alert` resource whose single configuration
# embeds a Prometheus rule group named "Kafka" (listed for appVersion 2.7).
#
# Aggregation convention: every expression aggregates with an explicit
# `by(kube_cluster_name, kube_namespace_name, ...)` clause so that independent
# Kafka deployments are evaluated separately and a firing alert carries the
# labels needed to locate it. If those labels are not present on the series,
# the `by(...)` clause yields a single unlabeled series, i.e. the same result
# as a bare `sum(...)`.
apiVersion: v1
kind: Alert
app: kafka
version: 1.0.0
appVersion:
- '2.7'
descriptionFile: ALERTS.md
configurations:
- kind: Prometheus
  data: |-
    groups:
    - name: Kafka
      rules:
      # Controller rules: grouped per cluster + namespace only. kube_workload_name
      # is intentionally omitted, because the controller count must be summed over
      # all brokers of a Kafka cluster and those may be spread over more than one
      # workload. NOTE(review): assumes one Kafka cluster per namespace - confirm.
      - alert: '[Kafka] No Leader'
        expr: |
          sum by(kube_cluster_name, kube_namespace_name)(kafka_controller_active_controller) < 1
        for: 5m
        labels:
          severity: critical
        annotations:
          description: There is no ActiveController or 'leader' in the Kafka cluster.
      - alert: '[Kafka] Too Many Leaders'
        expr: |
          sum by(kube_cluster_name, kube_namespace_name)(kafka_controller_active_controller) > 1
        for: 10m
        labels:
          severity: critical
        annotations:
          description: There is more than one ActiveController or 'leader' in the Kafka cluster.
      # Partition health rules: same grouping as the consumer-group and throttle
      # rules below, so the alert identifies the affected cluster/namespace/workload.
      - alert: '[Kafka] Offline Partitions'
        expr: |
          sum by(kube_cluster_name, kube_namespace_name, kube_workload_name)(kafka_controller_offline_partitions) > 0
        for: 5m
        labels:
          severity: critical
        annotations:
          description: There are one or more Offline Partitions. These partitions don’t have an active leader and are hence not writable or readable.
      - alert: '[Kafka] Under Replicated Partitions'
        expr: |
          sum by(kube_cluster_name, kube_namespace_name, kube_workload_name)(kafka_server_under_replicated_partitions) > 0
        for: 10m
        labels:
          severity: warning
        annotations:
          description: There are one or more Under Replicated Partitions.
      - alert: '[Kafka] Under In-Sync Replicated Partitions'
        expr: |
          sum by(kube_cluster_name, kube_namespace_name, kube_workload_name)(kafka_server_under_isr_partitions) > 0
        for: 10m
        labels:
          severity: warning
        annotations:
          description: There are one or more Under In-Sync Replicated Partitions. These partitions will be unavailable to producers who use 'acks=all'.
      # Fires when a consumer group/topic has lag (> 0) AND that lag has not
      # shrunk over the last 2 minutes (delta >= 0), sustained for 15 minutes.
      # Both sides use the same by(...) labels so the `and` matches one-to-one.
      - alert: '[Kafka] ConsumerGroup Lag Not Decreasing'
        expr: |
          (sum by(kube_cluster_name, kube_namespace_name, kube_workload_name, consumergroup, topic)(kafka_consumergroup_lag) > 0)
          and
          (sum by(kube_cluster_name, kube_namespace_name, kube_workload_name, consumergroup, topic)(delta(kafka_consumergroup_lag[2m])) >= 0)
        for: 15m
        labels:
          severity: warning
        annotations:
          description: The ConsumerGroup lag is not decreasing. The Consumers might be down, failing to process the messages and continuously retrying, or their consumption rate is lower than the production rate of messages.
      - alert: '[Kafka] ConsumerGroup Without Members'
        expr: |
          sum by(kube_cluster_name, kube_namespace_name, kube_workload_name, consumergroup)(kafka_consumergroup_members) == 0
        for: 10m
        labels:
          severity: info
        annotations:
          description: The ConsumerGroup doesn't have any members.
      # Throttle rules: one rule per quota type (client-id, user, user + client-id),
      # for producers and then consumers. Each takes the max throttle time per
      # quota entity and fires when it stays above 1000 for 5 minutes.
      # NOTE(review): the threshold is presumably in milliseconds - confirm
      # against the exporter that produces these metrics.
      - alert: '[Kafka] Producer High ThrottleTime By Client-Id'
        expr: |
          max by(kube_cluster_name, kube_namespace_name, kube_workload_name, client_id)(kafka_server_producer_client_throttle_time) > 1000
        for: 5m
        labels:
          severity: warning
        annotations:
          description: The Producer has reached its quota and has high throttle time. Applicable when Client-Id-only quotas are being used.
      - alert: '[Kafka] Producer High ThrottleTime By User'
        expr: |
          max by(kube_cluster_name, kube_namespace_name, kube_workload_name, user)(kafka_server_producer_user_throttle_time) > 1000
        for: 5m
        labels:
          severity: warning
        annotations:
          description: The Producer has reached its quota and has high throttle time. Applicable when User-only quotas are being used.
      - alert: '[Kafka] Producer High ThrottleTime By User And Client-Id'
        expr: |
          max by(kube_cluster_name, kube_namespace_name, kube_workload_name, user, client_id)(kafka_server_producer_user_client_throttle_time) > 1000
        for: 5m
        labels:
          severity: warning
        annotations:
          description: The Producer has reached its quota and has high throttle time. Applicable when Client-Id + User quotas are being used.
      - alert: '[Kafka] Consumer High ThrottleTime By Client-Id'
        expr: |
          max by(kube_cluster_name, kube_namespace_name, kube_workload_name, client_id)(kafka_server_consumer_client_throttle_time) > 1000
        for: 5m
        labels:
          severity: warning
        annotations:
          description: The Consumer has reached its quota and has high throttle time. Applicable when Client-Id-only quotas are being used.
      - alert: '[Kafka] Consumer High ThrottleTime By User'
        expr: |
          max by(kube_cluster_name, kube_namespace_name, kube_workload_name, user)(kafka_server_consumer_user_throttle_time) > 1000
        for: 5m
        labels:
          severity: warning
        annotations:
          description: The Consumer has reached its quota and has high throttle time. Applicable when User-only quotas are being used.
      - alert: '[Kafka] Consumer High ThrottleTime By User And Client-Id'
        expr: |
          max by(kube_cluster_name, kube_namespace_name, kube_workload_name, user, client_id)(kafka_server_consumer_user_client_throttle_time) > 1000
        for: 5m
        labels:
          severity: warning
        annotations:
          description: The Consumer has reached its quota and has high throttle time. Applicable when Client-Id + User quotas are being used.
0 commit comments