Update signals/panels descriptions

v-zhuravlev · v-zhuravlev · commit 3eedc1fd8f54 · 2025-10-22T00:47:20.000+03:00
diff --git a/kafka-observ-lib/signals/broker.libsonnet b/kafka-observ-lib/signals/broker.libsonnet
@@ -15,7 +15,10 @@ function(this)
     signals: {
       brokerMessagesInPerSec: {
         name: 'Broker messages in',
-        description: 'Broker messages in.',
+        description: |||
+          Rate of incoming messages published to this broker across all topics.  
+          Tracks producer throughput and write workload.  
+        |||,
         type: 'counter',
         unit: 'mps',
         sources: {
@@ -32,7 +35,10 @@ function(this)
       },
       brokerBytesInPerSec: {
         name: 'Broker bytes in',
-        description: 'Broker bytes in rate.',
+        description: |||
+          Rate of incoming data in bytes published to this broker from producers.  
+          Measures network and storage write load.  
+        |||,
         type: 'counter',
         unit: 'Bps',
         sources: {
@@ -49,7 +55,10 @@ function(this)
       },
       brokerBytesOutPerSec: {
         name: 'Broker bytes out',
-        description: 'Broker bytes out rate.',
+        description: |||
+          Rate of outgoing data in bytes sent from this broker to consumers and followers.  
+          Measures network read load and consumer throughput.  
+        |||,
         type: 'counter',
         unit: 'Bps',
         sources: {
diff --git a/kafka-observ-lib/signals/brokerReplicaManager.libsonnet b/kafka-observ-lib/signals/brokerReplicaManager.libsonnet
@@ -80,7 +80,10 @@ function(this)
       onlinePartitions: {
         name: 'Online partitions',
         description: |||
-          Online partitions.
+          Number of partitions that are currently online and available on this broker. This includes
+          partitions where this broker is either the leader or a follower replica. The total count
+          reflects the broker's share of the topic partitions across the cluster. A sudden drop in
+          online partitions may indicate broker issues, partition reassignments, or cluster rebalancing.
         |||,
         type: 'gauge',
         unit: 'short',
@@ -103,7 +106,11 @@ function(this)
       offlinePartitions: {
         name: 'Offline partitions',
         description: |||
-          Number of partitions that dont have an active leader and are hence not writable or readable.
+          Number of partitions that don't have an active leader and are hence not writable or readable.
+          Offline partitions indicate a critical availability issue as producers cannot write to these
+          partitions and consumers cannot read from them. This typically occurs when all replicas for
+          a partition are down or when there are not enough in-sync replicas to elect a new leader.
+          Any non-zero value requires immediate investigation and remediation to restore service availability.
         |||,
         type: 'gauge',
         unit: 'short',
@@ -125,7 +132,11 @@ function(this)
       underReplicatedPartitions: {
         name: 'Under replicated partitions',
         description: |||
-          Number of under replicated partitions (| ISR | < | all replicas |).
+          Number of partitions that have fewer in-sync replicas (ISR) than the configured replication factor.
+          Under-replicated partitions indicate potential data availability issues, as there are fewer copies
+          of the data than desired. This could be caused by broker failures, network issues, or brokers
+          falling behind in replication. A high number of under-replicated partitions poses a risk to
+          data durability and availability, as the loss of additional brokers could result in data loss.
         |||,
         type: 'gauge',
         unit: 'short',
@@ -145,7 +156,11 @@ function(this)
       underMinISRPartitions: {
         name: 'Under min ISR partitions',
         description: |||
-          Under min ISR(In-Sync replicas) partitions.
+          Number of partitions that have fewer in-sync replicas (ISR) than the configured minimum ISR threshold.
+          When the number of ISRs for a partition falls below the min.insync.replicas setting, the partition
+          becomes unavailable for writes (if acks=all is configured), which helps prevent data loss but impacts
+          availability. This metric indicates potential issues with broker availability, network connectivity,
+          or replication lag that need immediate attention to restore write availability.
         |||,
         type: 'gauge',
         unit: 'short',
@@ -165,7 +180,12 @@ function(this)
       uncleanLeaderElection: {
         name: 'Unclean leader election',
         description: |||
-          Unclean leader election rate.
+          Rate of unclean leader elections occurring in the cluster. An unclean leader election happens
+          when a partition leader fails and a replica that was not fully in-sync (not in the ISR) is
+          elected as the new leader. This results in potential data loss as the new leader may be missing
+          messages that were committed to the previous leader. Unclean elections occur when unclean.leader.election.enable
+          is set to true and there are no in-sync replicas available. Any occurrence of unclean elections
+          indicates a serious problem with cluster availability and replication health that risks data integrity.
         |||,
         type: 'raw',
         unit: 'short',
@@ -182,10 +202,16 @@ function(this)
             },
         },
       },
-      preferredReplicaInbalance: {
-        name: 'Preferred replica inbalance',
+      preferredReplicaImbalance: {
+        name: 'Preferred replica imbalance',
         description: |||
-          The count of topic partitions for which the leader is not the preferred leader.
+          The count of topic partitions for which the leader is not the preferred leader. In Kafka,
+          each partition has a preferred leader replica (typically the first replica in the replica list).
+          When leadership is not on the preferred replica, the cluster may experience uneven load distribution
+          across brokers, leading to performance imbalances. This can occur after broker failures and restarts,
+          or during cluster maintenance. Running a preferred replica election can help rebalance leadership
+          and optimize cluster performance. A consistently high imbalance may indicate issues with automatic
+          leader rebalancing or the need for manual intervention.
         |||,
         type: 'gauge',
         unit: 'short',
diff --git a/kafka-observ-lib/signals/cluster.libsonnet b/kafka-observ-lib/signals/cluster.libsonnet
@@ -17,7 +17,9 @@ function(this)
       activeControllers: {
         name: 'Active kafka controllers',
         description: |||
-          Active kafka controllers count.
+          Number of active controllers in the cluster. Should always be exactly 1.  
+          Zero indicates no controller elected, preventing cluster operations.  
+          More than one indicates split-brain requiring immediate attention.
         |||,
         type: 'gauge',
         unit: 'short',
@@ -41,7 +43,10 @@ function(this)
       role: {
         name: 'Current role',
         description: |||
-          0 - follower, 1 - controller.
+          Broker's current controller role:  
+          0 indicates follower, 1 indicates active controller.  
+          Only one broker should have value 1 at any time.  
+          Used to identify which broker is managing cluster metadata and leadership.
         |||,
         type: 'gauge',
         unit: 'short',
@@ -119,7 +124,9 @@ function(this)
       kraftBrokerRole: {
         name: 'Current role (kraft)',
         description: |||
-          Any value - broker in kraft.
+          Broker state in KRaft mode (Kafka without ZooKeeper).  
+          Any value indicates the broker is running in KRaft mode.  
+          Used to identify KRaft-enabled brokers in the cluster.
         |||,
         type: 'gauge',
         unit: 'short',
@@ -155,7 +162,7 @@ function(this)
       brokersCount: {
         name: 'Brokers count',
         description: |||
-          Active brokers count.
+          Total number of active brokers currently registered and reporting in the cluster.  
         |||,
         type: 'gauge',
         unit: 'short',
@@ -178,7 +185,10 @@ function(this)
 
       clusterMessagesInPerSec: {
         name: 'Cluster messages in',
-        description: 'Cluster messages in.',
+        description: |||
+          Aggregate rate of incoming messages across all brokers and topics in the cluster.  
+          Represents total producer throughput and write workload.  
+        |||,
         type: 'counter',
         unit: 'mps',
         sources: {
@@ -195,7 +205,10 @@ function(this)
       },
       clusterBytesInPerSec: {
         name: 'Cluster bytes in',
-        description: 'Cluster bytes in rate.',
+        description: |||
+          Aggregate rate of incoming data in bytes across all brokers from producers.  
+          Measures total network ingress and storage write load.  
+        |||,
         type: 'counter',
         unit: 'Bps',
         sources: {
@@ -212,7 +225,10 @@ function(this)
       },
       clusterBytesOutPerSec: {
         name: 'Cluster bytes out',
-        description: 'Cluster bytes out rate.',
+        description: |||
+          Aggregate rate of outgoing data in bytes across all brokers to consumers and followers.  
+          Measures total network egress load and consumer throughput.  
+        |||,
         type: 'counter',
         unit: 'Bps',
         sources: {
diff --git a/kafka-observ-lib/signals/consumerGroup.libsonnet b/kafka-observ-lib/signals/consumerGroup.libsonnet
@@ -16,7 +16,11 @@ function(this)
     signals: {
       consumerGroupLag: {
         name: 'Consumer group lag',
-        description: 'Current approximate lag of a ConsumerGroup at Topic/Partition.',
+        description: |||
+          Number of messages a consumer group is behind the latest available offset for a topic partition.  
+          High or growing lag indicates consumers can't keep up with producer throughput.  
+          Critical metric for consumer health and real-time processing requirements.
+        |||,
         type: 'gauge',
         unit: 'short',
         aggFunction: 'sum',
@@ -35,7 +39,11 @@ function(this)
 
       consumerGroupLagTime: {
         name: 'Consumer group lag in ms',
-        description: 'Current approximate lag of a ConsumerGroup at Topic/Partition.',
+        description: |||
+          Time lag in milliseconds between message production and consumption for a consumer group.  
+          Represents real-time delay in message processing.  
+          More intuitive than message count lag for understanding business impact of delays.
+        |||,
         type: 'gauge',
         unit: 'ms',
         optional: true,
@@ -50,7 +58,11 @@ function(this)
 
       consumerGroupConsumeRate: {
         name: 'Consumer group consume rate',
-        description: 'Consumer group consume rate.',
+        description: |||
+          Rate at which a consumer group is consuming and committing offsets for a topic.  
+          Measures consumer throughput and processing speed.  
+          Should match or exceed producer rate to prevent growing lag.
+        |||,
         type: 'counter',
         unit: 'mps',
         sources: {
diff --git a/kafka-observ-lib/signals/conversion.libsonnet b/kafka-observ-lib/signals/conversion.libsonnet
@@ -15,7 +15,11 @@ function(this)
     signals: {
       producerConversion: {
         name: 'Message conversion (producer)',
-        description: 'The number of messages produced converted to match the log.message.format.version.',
+        description: |||
+          Rate of producer messages requiring format conversion to match broker's log.message.format.version.  
+          Conversions add CPU overhead and latency.  
+          Non-zero values suggest producer and broker version mismatches requiring alignment.
+        |||,
         type: 'counter',
         unit: 'mps',
         sources: {
@@ -32,7 +36,11 @@ function(this)
       },
       consumerConversion: {
         name: 'Message conversion (consumer)',
-        description: 'The number of messages consumed converted at consumer to match the log.message.format.version.',
+        description: |||
+          Rate of messages requiring format conversion during consumer fetch to match log.message.format.version.  
+          Conversions impact broker CPU and consumer latency.  
+          Indicates version mismatch between stored messages and consumer expectations.
+        |||,
         type: 'counter',
         unit: 'mps',
         sources: {
diff --git a/kafka-observ-lib/signals/topic.libsonnet b/kafka-observ-lib/signals/topic.libsonnet
@@ -16,7 +16,11 @@ function(this)
     signals: {
       topicMessagesPerSec: {
         name: 'Messages in per second',
-        description: 'Messages in per second.',
+        description: |||
+          Rate of messages produced to this topic across all partitions.  
+          Indicates topic write activity and producer throughput.  
+          Use to identify hot topics and understand data flow patterns.
+        |||,
         type: 'counter',
         unit: 'mps',
         sources: {
@@ -31,7 +35,9 @@ function(this)
       // used in table:
       topicMessagesPerSecByPartition: {
         name: 'Messages in per second',
-        description: 'Messages in per second.',
+        description: |||
+          Rate of messages produced to each partition within this topic.  
+        |||,
         type: 'counter',
         unit: 'mps',
         legendCustomTemplate: '{{ topic }}/{{ partition }}',
@@ -47,7 +53,9 @@ function(this)
       // JMX exporter extras
       topicBytesInPerSec: {
         name: 'Topic bytes in',
-        description: 'Topic bytes in rate.',
+        description: |||
+          Rate of incoming data in bytes written to this topic from producers.  
+        |||,
         type: 'counter',
         unit: 'Bps',
         sources: {
@@ -67,7 +75,9 @@ function(this)
       },
       topicBytesOutPerSec: {
         name: 'Topic bytes out',
-        description: 'Topic bytes out rate.',
+        description: |||
+          Rate of outgoing data in bytes read from this topic by consumers.  
+        |||,
         type: 'counter',
         unit: 'Bps',
         sources: {
@@ -87,7 +97,9 @@ function(this)
       },
       topicLogStartOffset: {
         name: 'Topic start offset',
-        description: 'Topic start offset.',
+        description: |||
+          Earliest available offset for each partition; it advances as retention or deletion removes older messages.  
+        |||,
         type: 'gauge',
         unit: 'none',
         aggFunction: 'max',
@@ -109,7 +121,11 @@ function(this)
       },
       topicLogEndOffset: {
         name: 'Topic end offset',
-        description: 'Topic end offset.',
+        description: |||
+          Latest offset (high water mark) for each partition representing newest available message.  
+          Continuously increases as new messages arrive.  
+          Difference between end and start offsets indicates total messages available.
+        |||,
         type: 'gauge',
         unit: 'none',
         aggFunction: 'max',
@@ -126,7 +142,10 @@ function(this)
       },
       topicLogSize: {
         name: 'Topic log size',
-        description: 'Size in bytes of the current topic-partition.',
+        description: |||
+          Total size in bytes of data stored on disk for each topic partition.  
+          Grows with incoming messages and shrinks with retention cleanup.  
+        |||,
         type: 'gauge',
         unit: 'decbytes',
         aggFunction: 'max',
diff --git a/kafka-observ-lib/signals/totalTime.libsonnet b/kafka-observ-lib/signals/totalTime.libsonnet
@@ -16,23 +16,27 @@ function(this)
     signals: {
 
       local commonRequestQueueDescription = |||
-        A high value can imply there aren't enough IO threads or the CPU is a bottleneck, 
-        or the request queue isnt large enough. The request queue size should match the number of connections.
+        High values indicate insufficient IO threads, CPU bottlenecks, or an undersized request queue.  
+        Queue size should match connection count.
       |||,
 
       local commonLocalDescription = |||
-        In most cases, a high value can imply slow local storage or the storage is a bottleneck. One should also investigate LogFlushRateAndTimeMs to know how long page flushes are taking, which will also indicate a slow disk. In the case of FetchFollower requests, time spent in LocalTimeMs can be the result of a ZooKeeper write to change the ISR.
+        High values often indicate slow storage or disk bottlenecks.  
+        Check LogFlushRateAndTimeMs for disk performance issues.
       |||,
 
       local commonRemoteDescription = |||
+        For fetch requests, high values may indicate caught-up consumers with no new data (normal if near max wait time).  
+        Configure via replica.fetch.wait.max.ms and fetch.max.wait.ms.
       |||,
 
       local commonResponseQueueDescription = |||
-        A high value can imply there aren't enough network threads or the network cant dequeue responses quickly enough, causing back pressure in the response queue. 
+        High values indicate insufficient network threads or slow network dequeue causing backpressure.
       |||,
 
       local commonResponseDescription = |||
-        A high value can imply the zero-copy from disk to the network is slow, or the network is the bottleneck because the network cant dequeue responses of the TCP socket as quickly as theyre being created. If the network buffer gets full, Kafka will block. 
+        High values indicate slow zero-copy operations or network saturation.  
+        Network buffer fullness can cause Kafka to block.
       |||,
 
       fetchQueueTime: {
diff --git a/kafka-observ-lib/signals/zookeeperClient.libsonnet b/kafka-observ-lib/signals/zookeeperClient.libsonnet