Skip to content

Commit 1969ae4

Browse files
committed
WIP Add AMQP observability
Add session -> exchange metrics for unroutable_returned and unroutable_dropped.
Add session -> exchange metrics for accepted.
WIP: Add session <-> queue stats.
Single ETS session table.
Expose per-session metrics in Prometheus.
Add 'rabbitmqctl list_sessions' CLI command.
list_channels queries into each channel proc. This commit decides that list_sessions reads the local ETS table instead. Each node sends its session infos directly to the CLI node to avoid large amounts of data being transferred across RabbitMQ nodes.
Advantages:
* Same code path is used for Prometheus, Management UI, and CLI because they all query the same single source of truth: ETS table `session_metrics`.
* Avoid waking up potentially hundreds of thousands of processes at the same time.
Disadvantages:
* Data is slightly old because the session emits stats every interval (5 seconds by default). But this shouldn't matter for this CLI command.
1 parent dbd9ede commit 1969ae4

File tree

9 files changed

+712
-146
lines changed

9 files changed

+712
-146
lines changed

deps/rabbit/src/rabbit_amqp_session.erl

Lines changed: 440 additions & 97 deletions
Large diffs are not rendered by default.

deps/rabbit_common/include/rabbit_core_metrics.hrl

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,8 @@
2626

2727
-define(CORE_EXTRA_TABLES, [{gen_server2_metrics, set},
2828
{auth_attempt_metrics, set},
29-
{auth_attempt_detailed_metrics, set}]).
29+
{auth_attempt_detailed_metrics, set},
30+
{session_metrics, set}]).
3031

3132
% `CORE_NON_CHANNEL_TABLES` are tables that store counters representing the
3233
% same info as some of the channel_queue_metrics, channel_exchange_metrics and

deps/rabbit_common/src/rabbit_core_metrics.erl

Lines changed: 128 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,14 @@
1818
connection_stats/2,
1919
connection_stats/4]).
2020

21+
-export([session_begun/1,
22+
session_ended/0,
23+
session_pids/0,
24+
session_infos/1,
25+
session_stats/1
26+
% session_stats/4
27+
]).
28+
2129
-export([channel_created/2,
2230
channel_closed/1,
2331
channel_stats/2,
@@ -38,6 +46,8 @@
3846
queue_deleted/1,
3947
queues_deleted/1]).
4048

49+
-export([exchange_stats/3]).
50+
4151
-export([node_stats/2]).
4252

4353
-export([node_node_stats/2]).
@@ -54,18 +64,36 @@
5464
get_auth_attempts/0,
5565
get_auth_attempts_by_source/0]).
5666

67+
% -define(QUEUE_METRICS_DEFAULT(QName),
68+
% %% Last field is delete marker.
69+
% {QName, 0, 0, 0, 0, 0, 0, 0, 0}).
70+
71+
% -define(SESSION_EXCHANGE_METRICS_DEFAULT(Key),
72+
% %% Last field is delete marker.
73+
% {Key, 0, 0, 0, 0, 0}).
74+
75+
% -define(SESSION_QUEUE_METRICS_DEFAULT(Key),
76+
% %% Last field is delete marker.
77+
% {Key, 0, 0, 0, 0, 0, 0, 0, 0}).
78+
5779
%%----------------------------------------------------------------------------
5880
%% Types
5981
%%----------------------------------------------------------------------------
60-
-type(channel_stats_id() :: pid() |
61-
{pid(),
62-
{rabbit_types:rabbit_amqqueue_name(), rabbit_types:exchange_name()}} |
63-
{pid(), rabbit_types:rabbit_amqqueue_name()} |
64-
{pid(), rabbit_types:exchange_name()}).
82+
83+
% -type(session_stats_key() :: {pid(), rabbit_types:r(exchange | queue)}).
84+
% -type(session_stats_type() :: exchange_stats | queue_stats).
85+
86+
-type(channel_stats_id() ::
87+
pid() |
88+
{pid(),
89+
{rabbit_types:rabbit_amqqueue_name(), rabbit_types:exchange_name()} |
90+
rabbit_types:r(exchange | queue)}).
6591

6692
-type(channel_stats_type() :: queue_exchange_stats | queue_stats |
6793
exchange_stats | reductions).
6894

95+
-type(exchange_operation() :: publish | confirm | return_unroutable | drop_unroutable).
96+
6997
-type(activity_status() :: up | single_active | waiting | suspected_down).
7098
%%----------------------------------------------------------------------------
7199
%% Specs
@@ -107,8 +135,7 @@
107135
%%----------------------------------------------------------------------------
108136

109137
create_table({Table, Type}) ->
110-
ets:new(Table, [Type, public, named_table, {write_concurrency, true},
111-
{read_concurrency, true}]).
138+
ets:new(Table, [Type, public, named_table, {write_concurrency, true}]).
112139

113140
init() ->
114141
Tables = ?CORE_TABLES ++ ?CORE_EXTRA_TABLES ++ ?CORE_NON_CHANNEL_TABLES,
@@ -146,6 +173,92 @@ connection_stats(Pid, Recv_oct, Send_oct, Reductions) ->
146173
ets:insert(connection_coarse_metrics, {Pid, Recv_oct, Send_oct, Reductions, 0}),
147174
ok.
148175

176+
-spec session_begun(rabbit_types:infos()) -> ok.
177+
session_begun(ImmutableInfos) ->
178+
ets:insert(session_metrics, {self(), ImmutableInfos, _MutableInfos = []}),
179+
ok.
180+
181+
-spec session_ended() -> ok.
182+
session_ended() ->
183+
ets:delete(session_metrics, self()),
184+
ok.
185+
186+
-spec session_pids() -> [pid()].
187+
session_pids() ->
188+
lists:map(fun([Pid]) ->
189+
Pid
190+
end, ets:match(session_metrics, {'$1', '_', '_'})).
191+
192+
-spec session_infos(rabbit_types:info_keys()) -> [rabbit_types:infos()].
193+
session_infos(Items) ->
194+
lists:map(fun({Pid, ImmutableInfos, MutableInfos}) ->
195+
Infos = maps:from_list([{pid, Pid}] ++ ImmutableInfos ++ MutableInfos),
196+
lists:map(fun(Item) ->
197+
{Item, maps:get(Item, Infos)}
198+
end, Items)
199+
end, ets:tab2list(session_metrics)).
200+
201+
-spec session_stats(rabbit_types:infos()) -> ok.
202+
session_stats(MutableInfos) ->
203+
ets:update_element(session_metrics, self(), {3, MutableInfos}),
204+
ok.
205+
206+
% -spec session_stats(session_stats_type(), atom(), session_stats_key(), pos_integer()) -> ok.
207+
% session_stats(exchange_stats, publish, {_SessionPid, XName} = Key, Value) ->
208+
% _ = ets:update_counter(session_exchange_metrics, Key, {2, Value}, ?SESSION_EXCHANGE_METRICS_DEFAULT(Key)),
209+
% _ = ets:update_counter(exchange_metrics, XName, {2, Value}, ?EXCHANGE_METRICS_DEFAULT(XName)),
210+
% ok;
211+
% session_stats(exchange_stats, accept, {_SessionPid, XName} = Key, Value) ->
212+
% _ = ets:update_counter(session_exchange_metrics, Key, {3, Value}, ?SESSION_EXCHANGE_METRICS_DEFAULT(Key)),
213+
% _ = ets:update_counter(exchange_metrics, XName, {3, Value}, ?EXCHANGE_METRICS_DEFAULT(XName)),
214+
% ok;
215+
% session_stats(exchange_stats, return_unroutable, {_SessionPid, XName} = Key, Value) ->
216+
% _ = ets:update_counter(session_exchange_metrics, Key, {4, Value}, ?SESSION_EXCHANGE_METRICS_DEFAULT(Key)),
217+
% _ = ets:update_counter(exchange_metrics, XName, {4, Value}, ?EXCHANGE_METRICS_DEFAULT(XName)),
218+
% ok;
219+
% session_stats(exchange_stats, drop_unroutable, {_SessionPid, XName} = Key, Value) ->
220+
% _ = ets:update_counter(session_exchange_metrics, Key, {5, Value}, ?SESSION_EXCHANGE_METRICS_DEFAULT(Key)),
221+
% _ = ets:update_counter(exchange_metrics, XName, {5, Value}, ?EXCHANGE_METRICS_DEFAULT(XName)),
222+
% ok;
223+
224+
% session_stats(queue_stats, deliver_unsettled, {_SessionPid, QName} = Key, Value) ->
225+
% _ = ets:update_counter(session_queue_metrics, Key, {2, Value}, ?SESSION_QUEUE_METRICS_DEFAULT(Key)),
226+
% _ = ets:update_counter(queue_delivery_metrics, QName, {4, Value}, ?QUEUE_METRICS_DEFAULT(QName)),
227+
% ok;
228+
% session_stats(queue_stats, deliver_settled, {_SessionPid, QName} = Key, Value) ->
229+
% _ = ets:update_counter(session_queue_metrics, Key, {3, Value}, ?SESSION_QUEUE_METRICS_DEFAULT(Key)),
230+
% _ = ets:update_counter(queue_delivery_metrics, QName, {5, Value}, ?QUEUE_METRICS_DEFAULT(QName)),
231+
% ok;
232+
% session_stats(queue_stats, redeliver, {_SessionPid, QName} = Key, Value) ->
233+
% _ = ets:update_counter(session_queue_metrics, Key, {4, Value}, ?SESSION_QUEUE_METRICS_DEFAULT(Key)),
234+
% _ = ets:update_counter(queue_delivery_metrics, QName, {6, Value}, ?QUEUE_METRICS_DEFAULT(QName)),
235+
% ok;
236+
% session_stats(queue_stats, accept, {_SessionPid, QName} = Key, Value) ->
237+
% _ = ets:update_counter(session_queue_metrics, Key, {5, Value}, ?SESSION_QUEUE_METRICS_DEFAULT(Key)),
238+
% _ = ets:update_counter(queue_delivery_metrics, QName, {7, Value}, ?QUEUE_METRICS_DEFAULT(QName)),
239+
% ok;
240+
% session_stats(queue_stats, reject, {_SessionPid, _QName} = Key, Value) ->
241+
% _ = ets:update_counter(session_queue_metrics, Key, {6, Value}, ?SESSION_QUEUE_METRICS_DEFAULT(Key)),
242+
% ok;
243+
% session_stats(queue_stats, release, {_SessionPid, _QName} = Key, Value) ->
244+
% _ = ets:update_counter(session_queue_metrics, Key, {7, Value}, ?SESSION_QUEUE_METRICS_DEFAULT(Key)),
245+
% ok;
246+
% session_stats(queue_stats, modify, {_SessionPid, _QName} = Key, Value) ->
247+
% _ = ets:update_counter(session_queue_metrics, Key, {8, Value}, ?SESSION_QUEUE_METRICS_DEFAULT(Key)),
248+
% ok.
249+
250+
-spec exchange_stats(exchange_operation(), rabbit_types:exchange_name(), pos_integer()) -> ok.
251+
exchange_stats(Operation, XName, Incr) ->
252+
Pos = case Operation of
253+
publish -> 2;
254+
confirm -> 3;
255+
return_unroutable -> 4;
256+
drop_unroutable -> 5
257+
end,
258+
%% Last field is delete marker.
259+
_ = ets:update_counter(exchange_metrics, XName, {Pos, Incr}, {XName, 0, 0, 0, 0, 0}),
260+
ok.
261+
149262
channel_created(Pid, Infos) ->
150263
ets:insert(channel_created, {Pid, Infos}),
151264
_ = ets:update_counter(connection_churn_metrics, node(), {4, 1},
@@ -168,26 +281,22 @@ channel_stats(reductions, Id, Value) ->
168281
ets:insert(channel_process_metrics, {Id, Value}),
169282
ok.
170283

171-
channel_stats(exchange_stats, publish, {_ChannelPid, XName} = Id, Value) ->
284+
channel_stats(exchange_stats, Op = publish, {_ChannelPid, XName} = Id, Value) ->
172285
%% Includes delete marker
173286
_ = ets:update_counter(channel_exchange_metrics, Id, {2, Value}, {Id, 0, 0, 0, 0, 0}),
174-
_ = ets:update_counter(exchange_metrics, XName, {2, Value}, {XName, 0, 0, 0, 0, 0}),
175-
ok;
176-
channel_stats(exchange_stats, confirm, {_ChannelPid, XName} = Id, Value) ->
287+
exchange_stats(Op, XName, Value);
288+
channel_stats(exchange_stats, Op = confirm, {_ChannelPid, XName} = Id, Value) ->
177289
%% Includes delete marker
178290
_ = ets:update_counter(channel_exchange_metrics, Id, {3, Value}, {Id, 0, 0, 0, 0, 0}),
179-
_ = ets:update_counter(exchange_metrics, XName, {3, Value}, {XName, 0, 0, 0, 0, 0}),
180-
ok;
181-
channel_stats(exchange_stats, return_unroutable, {_ChannelPid, XName} = Id, Value) ->
291+
exchange_stats(Op, XName, Value);
292+
channel_stats(exchange_stats, Op = return_unroutable, {_ChannelPid, XName} = Id, Value) ->
182293
%% Includes delete marker
183294
_ = ets:update_counter(channel_exchange_metrics, Id, {4, Value}, {Id, 0, 0, 0, 0, 0}),
184-
_ = ets:update_counter(exchange_metrics, XName, {4, Value}, {XName, 0, 0, 0, 0, 0}),
185-
ok;
186-
channel_stats(exchange_stats, drop_unroutable, {_ChannelPid, XName} = Id, Value) ->
295+
exchange_stats(Op, XName, Value);
296+
channel_stats(exchange_stats, Op = drop_unroutable, {_ChannelPid, XName} = Id, Value) ->
187297
%% Includes delete marker
188298
_ = ets:update_counter(channel_exchange_metrics, Id, {5, Value}, {Id, 0, 0, 0, 0, 0}),
189-
_ = ets:update_counter(exchange_metrics, XName, {5, Value}, {XName, 0, 0, 0, 0, 0}),
190-
ok;
299+
exchange_stats(Op, XName, Value);
191300
channel_stats(queue_exchange_stats, publish, {_ChannelPid, QueueExchange} = Id, Value) ->
192301
%% Includes delete marker
193302
_ = ets:update_counter(channel_queue_exchange_metrics, Id, Value, {Id, 0, 0}),

deps/rabbit_common/src/rabbit_event.erl

Lines changed: 19 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,9 @@
1010
-include("rabbit.hrl").
1111

1212
-export([start_link/0]).
13-
-export([init_stats_timer/2, init_disabled_stats_timer/2,
13+
-export([init_stats_timer/0, init_stats_timer/2, init_disabled_stats_timer/2,
1414
ensure_stats_timer/3, stop_stats_timer/2, reset_stats_timer/2]).
15-
-export([stats_level/2, if_enabled/3]).
15+
-export([stats_level/1, stats_level/2, if_enabled/3]).
1616
-export([notify/2, notify/3, notify_if/3]).
1717
-export([sync_notify/2, sync_notify/3]).
1818

@@ -49,7 +49,6 @@
4949
-spec ensure_stats_timer(container(), pos(), term()) -> container().
5050
-spec stop_stats_timer(container(), pos()) -> container().
5151
-spec reset_stats_timer(container(), pos()) -> container().
52-
-spec stats_level(container(), pos()) -> level().
5352
-spec if_enabled(container(), pos(), timer_fun()) -> 'ok'.
5453
-spec notify(event_type(), event_props()) -> 'ok'.
5554
-spec notify(event_type(), event_props(), reference() | 'none') -> 'ok'.
@@ -89,12 +88,18 @@ start_link() ->
8988
%% Nowadays, instead of sending a message to rabbit_event via notify(stats),
9089
%% some stat-emitting objects update ETS tables directly via module rabbit_core_metrics.
9190

92-
init_stats_timer(C, P) ->
91+
-spec init_stats_timer() -> state().
92+
init_stats_timer() ->
9393
%% If the rabbit app is not loaded - use default none:5000
9494
StatsLevel = application:get_env(rabbit, collect_statistics, none),
95-
Interval = application:get_env(rabbit, collect_statistics_interval, 5000),
96-
setelement(P, C, #state{level = StatsLevel, interval = Interval,
97-
timer = undefined}).
95+
Interval = application:get_env(rabbit, collect_statistics_interval, 5000),
96+
#state{level = StatsLevel,
97+
interval = Interval,
98+
timer = undefined}.
99+
100+
init_stats_timer(C, P) ->
101+
State = init_stats_timer(),
102+
setelement(P, C, State).
98103

99104
init_disabled_stats_timer(C, P) ->
100105
setelement(P, C, #state{level = none, interval = 0, timer = undefined}).
@@ -128,10 +133,14 @@ reset_stats_timer(C, P) ->
128133
C
129134
end.
130135

131-
stats_level(C, P) ->
132-
#state{level = Level} = element(P, C),
136+
-spec stats_level(state()) -> level().
137+
stats_level(#state{level = Level}) ->
133138
Level.
134139

140+
-spec stats_level(container(), pos()) -> level().
141+
stats_level(C, P) ->
142+
stats_level(element(P, C)).
143+
135144
if_enabled(C, P, Fun) ->
136145
case element(P, C) of
137146
#state{level = none} -> ok;
@@ -156,5 +165,5 @@ event_cons(Type, Props, Ref) ->
156165
#event{type = Type,
157166
props = Props,
158167
reference = Ref,
159-
timestamp = os:system_time(milli_seconds)}.
168+
timestamp = os:system_time(millisecond)}.
160169

deps/rabbitmq_cli/lib/rabbitmq/cli/core/doc_guide.ex

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ defmodule RabbitMQ.CLI.Core.DocGuide do
3636
Macros.defguide("alarms")
3737
Macros.defguide("disk_alarms")
3838
Macros.defguide("alternate_exchange", path_segment: "ae")
39+
Macros.defguide("amqp")
3940
Macros.defguide("channels")
4041
Macros.defguide("cli")
4142
Macros.defguide("clustering")
Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
## This Source Code Form is subject to the terms of the Mozilla Public
2+
## License, v. 2.0. If a copy of the MPL was not distributed with this
3+
## file, You can obtain one at https://mozilla.org/MPL/2.0/.
4+
##
5+
## Copyright (c) 2007-2023 Broadcom. All Rights Reserved. The term “Broadcom” refers to Broadcom Inc. and/or its subsidiaries. All rights reserved.
6+
##
7+
8+
defmodule RabbitMQ.CLI.Ctl.Commands.ListSessionsCommand do
9+
alias RabbitMQ.CLI.Core.{DocGuide, Helpers}
10+
alias RabbitMQ.CLI.Ctl.{InfoKeys, RpcStream}
11+
12+
@behaviour RabbitMQ.CLI.CommandBehaviour
13+
14+
def scopes(), do: [:ctl, :diagnostics]
15+
16+
@info_keys ~w(pid name connection_pid channel_number user vhost handle_max
17+
in_attach in_flow in_transfer in_disposition in_detach)a
18+
19+
def info_keys(), do: @info_keys
20+
21+
def merge_defaults([], opts) do
22+
merge_defaults(~w(pid name user), opts)
23+
end
24+
25+
def merge_defaults(args, opts) do
26+
{args, Map.merge(%{table_headers: true}, opts)}
27+
end
28+
29+
def validate(args, _) do
30+
case InfoKeys.validate_info_keys(args, @info_keys) do
31+
{:ok, _} -> :ok
32+
err -> err
33+
end
34+
end
35+
36+
use RabbitMQ.CLI.Core.RequiresRabbitAppRunning
37+
38+
def run([], opts) do
39+
run(~w(pid name user) |> Enum.map(&to_charlist/1), opts)
40+
end
41+
42+
def run([_ | _] = args, %{node: node_name, timeout: timeout}) do
43+
info_keys = InfoKeys.prepare_info_keys(args)
44+
broker_keys = InfoKeys.broker_keys(info_keys)
45+
46+
Helpers.with_nodes_in_cluster(node_name, fn nodes ->
47+
RpcStream.receive_list_items(
48+
node_name,
49+
:rabbit_amqp_session,
50+
:emit_info_all,
51+
[nodes, broker_keys],
52+
timeout,
53+
info_keys,
54+
Kernel.length(nodes)
55+
)
56+
end)
57+
end
58+
59+
use RabbitMQ.CLI.DefaultOutput
60+
61+
def formatter(), do: RabbitMQ.CLI.Formatters.Table
62+
63+
def banner(_, _), do: "Listing AMQP 1.0 sessions ..."
64+
65+
def usage() do
66+
"list_sessions [--no-table-headers] [<column> ...]"
67+
end
68+
69+
def usage_additional() do
70+
[
71+
["<column>", "must be one of " <> Enum.join(Enum.sort(@info_keys), ", ")]
72+
]
73+
end
74+
75+
def usage_doc_guides() do
76+
[
77+
DocGuide.amqp()
78+
]
79+
end
80+
81+
def help_section(), do: :observability_and_health_checks
82+
83+
def description(), do: "Lists all AMQP 1.0 sessions"
84+
end

deps/rabbitmq_mqtt/src/rabbit_mqtt_reader.erl

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@
3737
rabbit_mqtt_processor:state(),
3838
connection_state :: running | blocked,
3939
conserve :: boolean(),
40-
stats_timer :: option(rabbit_event:state()),
40+
stats_timer :: rabbit_event:state(),
4141
keepalive = rabbit_mqtt_keepalive:init() :: rabbit_mqtt_keepalive:state(),
4242
conn_name :: binary()
4343
}).
@@ -87,9 +87,9 @@ init(Ref) ->
8787
await_recv = false,
8888
connection_state = running,
8989
conserve = false,
90-
parse_state = rabbit_mqtt_packet:init_state()},
91-
State1 = control_throttle(State0),
92-
State = rabbit_event:init_stats_timer(State1, #state.stats_timer),
90+
parse_state = rabbit_mqtt_packet:init_state(),
91+
stats_timer = rabbit_event:init_stats_timer()},
92+
State = control_throttle(State0),
9393
gen_server:enter_loop(?MODULE, [], State);
9494
{error, Reason = enotconn} ->
9595
?LOG_INFO("MQTT could not get connection string: ~s", [Reason]),

0 commit comments

Comments
 (0)