Ra metrics fixes and release notes (#14739) (#14759)

mergify[bot] · mkuratczyk · web-flow · commit 7781829825f5 · 2025-10-20T16:49:11.000+02:00
* Return the correct Ra metrics per endpoint /metrics returns all coordination system metrics and some high-level QQ values /metrics/per-object returns all coordination metrics and selected metrics for quorum queues /metrics/detailed * if no vhost filter provided, returns all per-object metrics for all queues as well as all coordination system metrics * if a vhost filter is provided, returns all per-object metrics for all queues in that vhost, but doesn't return coordination metrics * release notes: document Ra Metrics -> Ra Counters transition (cherry picked from commit e9baade) Co-authored-by: Michal Kuratczyk <michal.kuratczyk@broadcom.com>
diff --git a/deps/rabbitmq_prometheus/src/collectors/prometheus_rabbitmq_raft_metrics_collector.erl b/deps/rabbitmq_prometheus/src/collectors/prometheus_rabbitmq_raft_metrics_collector.erl
@@ -53,8 +53,8 @@ collect_mf(_Registry, Callback) ->
 %% INTERNAL
 
 collect_aggregate_metrics(Prefix, Callback) ->
-    collect_max_values(Prefix, Callback),
-    collect_key_component_metrics(Prefix, Callback).
+    collect_key_component_metrics(Prefix, Callback),
+    collect_max_values(Prefix, Callback).
 
 collect_per_object_metrics(Prefix, Callback) ->
     collect_key_component_metrics(Prefix, Callback),
@@ -71,8 +71,6 @@ collect_detailed_metrics(Prefix, Callback) ->
                                      false
                              end
                      end,
-
-    collect_key_component_metrics(Prefix, Callback),
     collect_all_matching_metrics(Prefix, Callback, VHostFilterFun).
 
 collect_key_per_object_metrics(Prefix, Callback) ->
@@ -91,7 +89,10 @@ collect_key_per_object_metrics(Prefix, Callback) ->
                           Type,
                           Values))
       end,
-      seshat:format(ra, #{labels => as_binary, metrics => QQMetrics})).
+      seshat:format(ra,
+                    #{labels => as_binary,
+                      metrics => QQMetrics,
+                      filter_fun => fun onlyQueues/1})).
 
 collect_all_matching_metrics(Prefix, Callback, VHostFilterFun) ->
     maps:foreach(
@@ -106,7 +107,10 @@ collect_all_matching_metrics(Prefix, Callback, VHostFilterFun) ->
                           Type,
                           Values))
       end,
-      seshat:format(ra, #{labels => as_binary, metrics => all, filter_fun => VHostFilterFun})).
+      seshat:format(ra,
+                    #{labels => as_binary,
+                      metrics => all,
+                      filter_fun => VHostFilterFun})).
 
 collect_max_values(Prefix, Callback) ->
     %% max values for QQ metrics
@@ -115,20 +119,26 @@ collect_max_values(Prefix, Callback) ->
     %% rabbitmq_raft_num_segments{queue="q2",vhost="/"} 10.0
     %% becomes
     %% rabbitmq_raft_max_num_segments 10.0
-    QQMetrics = [num_segments],
+    QQMetrics = [num_segments, commit_latency],
     maps:foreach(
       fun(Name, #{type := Type, help := Help, values := Values}) ->
               Max = lists:max(maps:values(Values)),
               Callback(
                 create_mf(<<Prefix/binary, "max_", (prometheus_model_helpers:metric_name(Name))/binary>>,
                           Help,
                           Type,
-                          #{#{} => Max}))
+                          %% TODO: this should not be hardcoded; we should use
+                          %% something more like 'max() GROUP BY ra_system'
+                          #{#{ra_system => quorum_queues} => Max}))
 
       end,
-      seshat:format(ra, #{labels => as_binary, metrics => QQMetrics})).
+      seshat:format(ra,
+                    #{labels => as_binary,
+                      metrics => QQMetrics,
+                      filter_fun => fun onlyQueues/1})).
 
 collect_key_component_metrics(Prefix, Callback) ->
+    %% quorum queue metrics
     WALMetrics = [wal_files, bytes_written, mem_tables],
     SegmentWriterMetrics = [entries, segments],
     maps:foreach(
@@ -139,4 +149,25 @@ collect_key_component_metrics(Prefix, Callback) ->
                           Type,
                           Values))
       end,
-      seshat:format(ra, #{labels => as_binary, metrics => WALMetrics ++ SegmentWriterMetrics})).
+      seshat:format(ra,
+                    #{labels => as_binary,
+                      metrics => WALMetrics ++ SegmentWriterMetrics,
+                      filter_fun => fun onlyQueues/1})),
+    %% Khepri and other coordination metrics
+    maps:foreach(
+      fun(Name, #{type := Type, help := Help, values := Values}) ->
+              Callback(
+                create_mf(<<Prefix/binary, (prometheus_model_helpers:metric_name(Name))/binary>>,
+                          Help,
+                          Type,
+                          Values))
+      end,
+      seshat:format(ra,
+                    #{labels => as_binary,
+                      filter_fun => fun onlyCoordinationSystem/1})).
+
+onlyCoordinationSystem(#{ra_system := coordination}) -> true;
+onlyCoordinationSystem(_) -> false.
+
+onlyQueues(#{queue := _}) -> true;
+onlyQueues(_) -> false.
diff --git a/deps/rabbitmq_prometheus/test/rabbit_prometheus_http_SUITE.erl b/deps/rabbitmq_prometheus/test/rabbit_prometheus_http_SUITE.erl
@@ -411,7 +411,8 @@ aggregated_metrics_test(Config) ->
     ?assertEqual(match, re:run(Body, "^rabbitmq_io_read_time_seconds_total ", [{capture, none}, multiline])),
     %% Check the first TOTALS metric value
     ?assertEqual(match, re:run(Body, "^rabbitmq_connections ", [{capture, none}, multiline])),
-    ?assertEqual(nomatch, re:run(Body, "^rabbitmq_raft_commit_latency_seconds", [{capture, none}, multiline])),
+    ?assertEqual(match, re:run(Body, "^rabbitmq_raft_commit_latency_seconds", [{capture, none}, multiline])),
+    ?assertEqual(match, re:run(Body, "^rabbitmq_raft_max_commit_latency_seconds", [{capture, none}, multiline])),
     ?assertEqual(match, re:run(Body, "^rabbitmq_raft_bytes_written.*ra_log_segment_writer", [{capture, none}, multiline])),
     ?assertEqual(match, re:run(Body, "^rabbitmq_raft_bytes_written.*ra_log_wal", [{capture, none}, multiline])),
     ?assertEqual(match, re:run(Body, "^rabbitmq_raft_entries{", [{capture, none}, multiline])),
@@ -874,16 +875,10 @@ detailed_raft_metrics_test(Config) ->
     QQMetrics = #{#{queue => "a_quorum_queue", vhost => "/"} => ["1.0"]},
 
     {_, Body1} = http_get_with_pal(Config, "/metrics/detailed?family=ra_metrics&vhost=foo", [], 200),
-    %% no queues in vhost foo, so no QQ metrics
-    ?assertEqual(ComponentMetrics,
-                 map_get(rabbitmq_detailed_raft_wal_files, parse_response(Body1))),
     ?assertEqual(undefined,
                  maps:get(rabbitmq_detailed_raft_term, parse_response(Body1), undefined)),
 
     {_, Body2} = http_get_with_pal(Config, "/metrics/detailed?family=ra_metrics&vhost=/", [], 200),
-    %% there's a queue in vhost /
-    ?assertEqual(ComponentMetrics,
-                 map_get(rabbitmq_detailed_raft_wal_files, parse_response(Body2))),
     ?assertEqual(QQMetrics,
                  map_get(rabbitmq_detailed_raft_term, parse_response(Body2))),
 
diff --git a/release-notes/4.2.0.md b/release-notes/4.2.0.md
@@ -28,6 +28,14 @@ In other words, if the responder publishes to only this queue name, then the mes
 `*.cacerts` (not to be confused with `cacertfile`) settings in `rabbitmq.conf` did not have the expected effect and were removed
 to eliminate confusion.
 
+### Quorum Queue Metric Changes
+
+Metrics emitted for Ra-based components (quorum queues, Khepri, Stream Coordinator)
+have changed. Some metrics were removed, many were added, some changed their names.
+Users relying on Prometheus metrics starting with `rabbitmq_raft` or `rabbitmq_detailed_raft`
+will need to update their dashboards and/or alerts. If you are using the
+[RabbitMQ-Quorum-Queues-Raft dashboard](https://grafana.com/grafana/dashboards/11340-rabbitmq-quorum-queues-raft/),
+please update it to the latest version for RabbitMQ 4.2 compatibility.
 
 ## Release Highlights
 
@@ -407,6 +415,88 @@ compared to other versions.
  * `cuttlefish` was upgraded to [`3.5.0`](https://github.com/kyorai/cuttlefish/releases)
 
 
+## Ra Metric Changes
+
+Metrics emitted for Ra-based components (quorum queues, Khepri, Stream Coordinator)
+have changed. Some metrics were removed, many were added, some changed their names.
+For most users this should not require any action. However, users relying on Prometheus
+metrics starting with `rabbitmq_raft` or `rabbitmq_detailed_raft` will need to update
+their dashboards and/or alerts. If you are using the
+[RabbitMQ-Quorum-Queues-Raft dashboard](https://grafana.com/grafana/dashboards/11340-rabbitmq-quorum-queues-raft/),
+please update it to the latest version for RabbitMQ 4.2 compatibility.
+
+### More Accurate and Detailed Ra Metrics
+
+Ra is an internal component implementing the Raft protocol. It's the basis
+for quorum queues, as well as some internal components (currently Khepri
+and Stream Coordinator). For quite some time, Ra metrics were tracked in two places
+but RabbitMQ relied on the old metric subsystem. In RabbitMQ 4.2, the old
+Ra metrics subsystem has been removed and RabbitMQ now reports Ra metrics
+from the new subsystem (implemented using the [Seshat](https://github.com/rabbitmq/seshat) library).
+This migration has the following benefits:
+
+* lower overhead, since only one subsystem is used
+* more up-to-date information - the old subsystem was only refreshed every 5 seconds,
+  the new subsystem always returns the latest values
+* additional metrics are exposed, making it easier to debug the system if necessary
+
+### Aggregated metrics (/metrics endpoint)
+
+* `rabbitmq_raft_num_segments` was added; it reports the number of segment files of the internal components
+
+* `rabbitmq_raft_max_num_segments` was added; it reports the highest number of segment
+   files of any of the quorum queues; per-object metrics can be used to find which queue
+   has a high number of segment files
+
+* `rabbitmq_raft_term_total` has been removed;
+   this metric was emitted accidentally as a side effect of metric aggregation;
+   the sum of Raft terms across all Raft clusters is a meaningless number
+
+* some metrics contained the `_log_` substring in their name, even though they are not related to the Raft log;
+  hence, they were renamed to avoid the misleading part:
+    * `rabbitmq_raft_log_snapshot_index` -> `rabbitmq_raft_snapshot_index`
+    * `rabbitmq_raft_log_last_applied_index` -> `rabbitmq_raft_last_applied`
+    * `rabbitmq_raft_log_commit_index` -> `rabbitmq_raft_commit_index`
+    * `rabbitmq_raft_log_last_written_index` -> `rabbitmq_raft_last_written_index`
+
+* `rabbitmq_raft_entry_commit_latency_seconds` has been removed; it was an average latency across all Ra clusters
+  in all Ra systems (RabbitMQ currently uses two separate Ra systems: one for quorum queues and one for internal
+  components, currently Khepri and Stream Coordinator); it was therefore not very useful, since different
+  components can have very different latencies
+
+* `rabbitmq_raft_commit_latency_seconds` was added; in case of aggregated metrics, it is only reported for
+  internal components (currently Khepri and Stream Coordinator)
+
+* `rabbitmq_raft_max_commit_latency_seconds` has been added; it's the highest commit latency reported by any
+  of the quorum queues. When it's high, per-object metrics can be used to find which specific queue reports high commit latency
+
+### Per-object metrics (/metrics/per-object endpoint)
+
+More metrics are reported for each queue than in older versions.
+
+Incorrect metric names were corrected as described above.
+
+Additionally:
+* `rabbitmq_raft_term_total` has been renamed to `rabbitmq_raft_term` (the "total" suffix
+  was incorrect and misleading, since the metric is reported for each specific Ra cluster)
+
+* `rabbitmq_raft_num_segments` was added; it reports the number of segment files of the internal components
+  and for each quorum queue
+
+### Detailed metrics (/metrics/detailed endpoint)
+
+When the detailed endpoint is scraped with the `family=ra_metrics` parameter,
+more metrics are reported for each queue than in older versions.
+
+Incorrect metric names were corrected as described above.
+
+Additionally:
+* `rabbitmq_raft_term_total` has been renamed to `rabbitmq_raft_term` (the "total" suffix
+  was incorrect and misleading, since the metric is reported for each specific Ra cluster)
+
+* `rabbitmq_raft_num_segments` was added; it reports the number of segment files of the internal components
+  and for each quorum queue
+
 ## Source Code Archives
 
 To obtain source code of the entire distribution, please download the archive named `rabbitmq-server-4.2.0.tar.xz`