Skip to content

Commit 7781829

Browse files
Ra metrics fixes and release notes (#14739) (#14759)
* Return the correct Ra metrics per endpoint /metrics returns all coordination system metrics and some high-level QQ values /metrics/per-object returns all coordination metrics and selected metrics for quorum queues /metrics/detailed * if no vhost filter provided, returns all per-object metrics for all queues as well as all coordination system metrics * if a vhost filter is provided, returns all per-object metrics for all queues in that vhost, but doesn't return coordination metrics * release notes: document Ra Metrics -> Ra Counters transition (cherry picked from commit e9baade) Co-authored-by: Michal Kuratczyk <[email protected]>
1 parent c6b71a9 commit 7781829

File tree

3 files changed

+133
-17
lines changed

3 files changed

+133
-17
lines changed

deps/rabbitmq_prometheus/src/collectors/prometheus_rabbitmq_raft_metrics_collector.erl

Lines changed: 41 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -53,8 +53,8 @@ collect_mf(_Registry, Callback) ->
5353
%% INTERNAL
5454

5555
collect_aggregate_metrics(Prefix, Callback) ->
56-
collect_max_values(Prefix, Callback),
57-
collect_key_component_metrics(Prefix, Callback).
56+
collect_key_component_metrics(Prefix, Callback),
57+
collect_max_values(Prefix, Callback).
5858

5959
collect_per_object_metrics(Prefix, Callback) ->
6060
collect_key_component_metrics(Prefix, Callback),
@@ -71,8 +71,6 @@ collect_detailed_metrics(Prefix, Callback) ->
7171
false
7272
end
7373
end,
74-
75-
collect_key_component_metrics(Prefix, Callback),
7674
collect_all_matching_metrics(Prefix, Callback, VHostFilterFun).
7775

7876
collect_key_per_object_metrics(Prefix, Callback) ->
@@ -91,7 +89,10 @@ collect_key_per_object_metrics(Prefix, Callback) ->
9189
Type,
9290
Values))
9391
end,
94-
seshat:format(ra, #{labels => as_binary, metrics => QQMetrics})).
92+
seshat:format(ra,
93+
#{labels => as_binary,
94+
metrics => QQMetrics,
95+
filter_fun => fun onlyQueues/1})).
9596

9697
collect_all_matching_metrics(Prefix, Callback, VHostFilterFun) ->
9798
maps:foreach(
@@ -106,7 +107,10 @@ collect_all_matching_metrics(Prefix, Callback, VHostFilterFun) ->
106107
Type,
107108
Values))
108109
end,
109-
seshat:format(ra, #{labels => as_binary, metrics => all, filter_fun => VHostFilterFun})).
110+
seshat:format(ra,
111+
#{labels => as_binary,
112+
metrics => all,
113+
filter_fun => VHostFilterFun})).
110114

111115
collect_max_values(Prefix, Callback) ->
112116
%% max values for QQ metrics
@@ -115,20 +119,26 @@ collect_max_values(Prefix, Callback) ->
115119
%% rabbitmq_raft_num_segments{queue="q2",vhost="/"} 10.0
116120
%% becomes
117121
%% rabbitmq_raft_max_num_segments 10.0
118-
QQMetrics = [num_segments],
122+
QQMetrics = [num_segments, commit_latency],
119123
maps:foreach(
120124
fun(Name, #{type := Type, help := Help, values := Values}) ->
121125
Max = lists:max(maps:values(Values)),
122126
Callback(
123127
create_mf(<<Prefix/binary, "max_", (prometheus_model_helpers:metric_name(Name))/binary>>,
124128
Help,
125129
Type,
126-
#{#{} => Max}))
130+
%% TODO: this should not be hardcoded, we should do
131+
%% something more like 'max() GROUP BY ra_system'
132+
#{#{ra_system => quorum_queues} => Max}))
127133

128134
end,
129-
seshat:format(ra, #{labels => as_binary, metrics => QQMetrics})).
135+
seshat:format(ra,
136+
#{labels => as_binary,
137+
metrics => QQMetrics,
138+
filter_fun => fun onlyQueues/1})).
130139

131140
collect_key_component_metrics(Prefix, Callback) ->
141+
%% quorum queue metrics
132142
WALMetrics = [wal_files, bytes_written, mem_tables],
133143
SegmentWriterMetrics = [entries, segments],
134144
maps:foreach(
@@ -139,4 +149,25 @@ collect_key_component_metrics(Prefix, Callback) ->
139149
Type,
140150
Values))
141151
end,
142-
seshat:format(ra, #{labels => as_binary, metrics => WALMetrics ++ SegmentWriterMetrics})).
152+
seshat:format(ra,
153+
#{labels => as_binary,
154+
metrics => WALMetrics ++ SegmentWriterMetrics,
155+
filter_fun => fun onlyQueues/1})),
156+
%% Khepri and other coordination metrics
157+
maps:foreach(
158+
fun(Name, #{type := Type, help := Help, values := Values}) ->
159+
Callback(
160+
create_mf(<<Prefix/binary, (prometheus_model_helpers:metric_name(Name))/binary>>,
161+
Help,
162+
Type,
163+
Values))
164+
end,
165+
seshat:format(ra,
166+
#{labels => as_binary,
167+
filter_fun => fun onlyCoordinationSystem/1})).
168+
169+
onlyCoordinationSystem(#{ra_system := coordination}) -> true;
170+
onlyCoordinationSystem(_) -> false.
171+
172+
onlyQueues(#{queue := _}) -> true;
173+
onlyQueues(_) -> false.

deps/rabbitmq_prometheus/test/rabbit_prometheus_http_SUITE.erl

Lines changed: 2 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -411,7 +411,8 @@ aggregated_metrics_test(Config) ->
411411
?assertEqual(match, re:run(Body, "^rabbitmq_io_read_time_seconds_total ", [{capture, none}, multiline])),
412412
%% Check the first TOTALS metric value
413413
?assertEqual(match, re:run(Body, "^rabbitmq_connections ", [{capture, none}, multiline])),
414-
?assertEqual(nomatch, re:run(Body, "^rabbitmq_raft_commit_latency_seconds", [{capture, none}, multiline])),
414+
?assertEqual(match, re:run(Body, "^rabbitmq_raft_commit_latency_seconds", [{capture, none}, multiline])),
415+
?assertEqual(match, re:run(Body, "^rabbitmq_raft_max_commit_latency_seconds", [{capture, none}, multiline])),
415416
?assertEqual(match, re:run(Body, "^rabbitmq_raft_bytes_written.*ra_log_segment_writer", [{capture, none}, multiline])),
416417
?assertEqual(match, re:run(Body, "^rabbitmq_raft_bytes_written.*ra_log_wal", [{capture, none}, multiline])),
417418
?assertEqual(match, re:run(Body, "^rabbitmq_raft_entries{", [{capture, none}, multiline])),
@@ -874,16 +875,10 @@ detailed_raft_metrics_test(Config) ->
874875
QQMetrics = #{#{queue => "a_quorum_queue", vhost => "/"} => ["1.0"]},
875876

876877
{_, Body1} = http_get_with_pal(Config, "/metrics/detailed?family=ra_metrics&vhost=foo", [], 200),
877-
%% no queues in vhost foo, so no QQ metrics
878-
?assertEqual(ComponentMetrics,
879-
map_get(rabbitmq_detailed_raft_wal_files, parse_response(Body1))),
880878
?assertEqual(undefined,
881879
maps:get(rabbitmq_detailed_raft_term, parse_response(Body1), undefined)),
882880

883881
{_, Body2} = http_get_with_pal(Config, "/metrics/detailed?family=ra_metrics&vhost=/", [], 200),
884-
%% there's a queue in vhost /
885-
?assertEqual(ComponentMetrics,
886-
map_get(rabbitmq_detailed_raft_wal_files, parse_response(Body2))),
887882
?assertEqual(QQMetrics,
888883
map_get(rabbitmq_detailed_raft_term, parse_response(Body2))),
889884

release-notes/4.2.0.md

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,14 @@ In other words, if the responder publishes to only this queue name, then the mes
2828
`*.cacerts` (not to be confused with `cacertfile`) settings in `rabbitmq.conf` did not have the expected effect and were removed
2929
to eliminate confusion.
3030

31+
### Quorum Queue Metric Changes
32+
33+
Metrics emitted for Ra-based components (quorum queues, Khepri, Stream Coordinator)
34+
have changed. Some metrics were removed, many were added, some changed their names.
35+
Users relying on Prometheus metrics starting with `rabbitmq_raft` or `rabbitmq_detailed_raft`
36+
will need to update their dashboards and/or alerts. If you are using the
37+
[RabbitMQ-Quorum-Queues-Raft dashboard](https://grafana.com/grafana/dashboards/11340-rabbitmq-quorum-queues-raft/),
38+
please update it to the latest version for RabbitMQ 4.2 compatibility.
3139

3240
## Release Highlights
3341

@@ -407,6 +415,88 @@ compared to other versions.
407415
* `cuttlefish` was upgraded to [`3.5.0`](https://github.com/kyorai/cuttlefish/releases)
408416

409417

418+
## Ra Metric Changes
419+
420+
Metrics emitted for Ra-based components (quorum queues, Khepri, Stream Coordinator)
421+
have changed. Some metrics were removed, many were added, some changed their names.
422+
For most users this should not require any action. However, users relying on Prometheus
423+
metrics starting with `rabbitmq_raft` or `rabbitmq_detailed_raft` will need to update
424+
their dashboards and/or alerts. If you are using the
425+
[RabbitMQ-Quorum-Queues-Raft dashboard](https://grafana.com/grafana/dashboards/11340-rabbitmq-quorum-queues-raft/),
426+
please update it to the latest version for RabbitMQ 4.2 compatibility.
427+
428+
#### More Accurate and Detailed Ra Metrics
429+
430+
Ra is an internal component implementing the Raft protocol. It's the basis
431+
for quorum queues, as well as some internal components (currently Khepri
432+
and Stream Coordinator). For quite some time, Ra metrics were tracked in two places
433+
but RabbitMQ relied on the old metric subsystem. In RabbitMQ 4.2, the old
434+
Ra metrics subsystem has been removed and RabbitMQ now reports Ra metrics
435+
from the new subsystem (implemented using the [Seshat](https://github.com/rabbitmq/seshat) library).
436+
This migration has the following benefits:
437+
438+
* lower overhead, since only one subsystem is used
439+
* more up-to-date information - the old subsystem was only refreshed every 5 seconds,
440+
the new subsystem always returns the latest values
441+
* additional metrics are exposed, making it easier to debug the system if necessary
442+
443+
### Aggregated metrics (/metrics endpoint)
444+
445+
* `rabbitmq_raft_num_segments` was added; it reports the number of segment files of the internal components
446+
447+
* `rabbitmq_raft_max_num_segments` was added; it reports the highest number of segment
448+
files of any of the quorum queues; per-object metrics can be used to find which queue
449+
has a high number of segment files
450+
451+
* `rabbitmq_raft_term_total` has been removed;
452+
this metric was emitted accidentally as a side effect of metric aggregation;
453+
the sum of Raft terms across all Raft clusters is a meaningless number
454+
455+
* some metrics contained the `_log_` substring in their name, even though they are not related to the Raft log;
456+
hence, they were renamed to avoid the misleading part:
457+
* `rabbitmq_raft_log_snapshot_index` -> `rabbitmq_raft_snapshot_index`
458+
* `rabbitmq_raft_log_last_applied_index` -> `rabbitmq_raft_last_applied`
459+
* `rabbitmq_raft_log_commit_index` -> `rabbitmq_raft_commit_index`
460+
* `rabbitmq_raft_log_last_written_index` -> `rabbitmq_raft_last_written_index`
461+
462+
* `rabbitmq_raft_entry_commit_latency_seconds` has been removed; it was an average latency across all Ra clusters
463+
in all Ra systems (RabbitMQ currently uses two separate Ra systems: one for quorum queues and one for internal
464+
components, currently Khepri and Stream Coordinator); it was therefore not very useful, since different
465+
components can have very different latencies
466+
467+
* `rabbitmq_raft_commit_latency_seconds` was added; in case of aggregated metrics, it is only reported for
468+
internal components (currently Khepri and Stream Coordinator)
469+
470+
* `rabbitmq_raft_max_commit_latency_seconds` has been added; it's the highest commit latency reported by any
471+
of the quorum queues. When it's high, per-object metrics can be used to find which specific queue reports high commit latency
472+
473+
### Per-object metrics (/metrics/per-object endpoint)
474+
475+
More metrics are reported for each queue than in older versions.
476+
477+
Incorrect metric names were corrected as described above.
478+
479+
Additionally:
480+
* `rabbitmq_raft_term_total` has been renamed to `rabbitmq_raft_term` (the "total" suffix
481+
was incorrect and misleading, since the metric is reported for each specific Ra cluster)
482+
483+
* `rabbitmq_raft_num_segments` was added; it reports the number of segment files of the internal components
484+
and for each quorum queue
485+
486+
### Detailed metrics (/metrics/detailed endpoint)
487+
488+
When the detailed endpoint is scraped with the `family=ra_metrics` parameter,
489+
more metrics are reported for each queue than in older versions.
490+
491+
Incorrect metric names were corrected as described above.
492+
493+
Additionally:
494+
* `rabbitmq_raft_term_total` has been renamed to `rabbitmq_raft_term` (the "total" suffix
495+
was incorrect and misleading, since the metric is reported for each specific Ra cluster)
496+
497+
* `rabbitmq_raft_num_segments` was added; it reports the number of segment files of the internal components
498+
and for each quorum queue
499+
410500
## Source Code Archives
411501

412502
To obtain source code of the entire distribution, please download the archive named `rabbitmq-server-4.2.0.tar.xz`

0 commit comments

Comments
 (0)