Skip to content

Commit a5fed7b

Browse files
committed
rabbit_node_monitor: Notify rabbit is live when handling a nodeup message
[Why]

So far, when there was a network partition with Mnesia, the most popular partition handling strategies restarted RabbitMQ nodes. Therefore, `rabbit` would execute the boot steps, and one of them would notify other members of the cluster that "this RabbitMQ node is live". With Khepri, nodes are not restarted anymore, and thus boot steps are not executed at the end of a network partition. As a consequence, other members are not notified that a member is back online.

[How]

When the node monitor receives the `nodeup` message (managed by Erlang, meaning "a remote Erlang node just connected to this node through Erlang distribution"), a `node_up` message is sent to all cluster members (meaning "RabbitMQ is now running on the originating node"). Yeah, very poor naming...

This lets the RabbitMQ node monitor know when other nodes running RabbitMQ are back online and react accordingly.

Note that if a node is restarted, another node could receive the `node_up` message twice, so the actions behind it must be idempotent.
1 parent a09383d commit a5fed7b

File tree

1 file changed

+22
-10
lines changed

1 file changed

+22
-10
lines changed

deps/rabbit/src/rabbit_node_monitor.erl

Lines changed: 22 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -430,16 +430,8 @@ handle_call(status, _From, State = #state{partitions = Partitions}) ->
430430
handle_call(_Request, _From, State) ->
431431
{noreply, State}.
432432

433-
handle_cast(notify_node_up, State = #state{guid = GUID}) ->
434-
Nodes = rabbit_nodes:list_reachable() -- [node()],
435-
gen_server:abcast(Nodes, ?SERVER,
436-
{node_up, node(), rabbit_db_cluster:node_type(), GUID}),
437-
%% register other active rabbits with this rabbit
438-
DiskNodes = rabbit_db_cluster:disc_members(),
439-
[gen_server:cast(?SERVER, {node_up, N, case lists:member(N, DiskNodes) of
440-
true -> disc;
441-
false -> ram
442-
end}) || N <- Nodes],
433+
handle_cast(notify_node_up, State) ->
434+
do_notify_node_up(State),
443435
{noreply, State};
444436

445437
%%----------------------------------------------------------------------------
@@ -665,6 +657,12 @@ handle_info({nodedown, Node, Info}, State) ->
665657

666658
handle_info({nodeup, Node, _Info}, State) ->
667659
?LOG_INFO("node ~tp up", [Node]),
660+
%% We notify that `rabbit' is up here too (in addition to the message sent
661+
%% explicitly by a boot step). That's because nodes may go down then up
662+
%% during a network partition, and with Khepri, nodes are not restarted
663+
%% (unlike with some partition handling strategies used with Mnesia), and
664+
%% thus the boot steps are not executed.
665+
do_notify_node_up(State),
668666
{noreply, State};
669667

670668
handle_info({mnesia_system_event,
@@ -854,6 +852,20 @@ wait_for_cluster_recovery(Condition) ->
854852
wait_for_cluster_recovery(Condition)
855853
end.
856854

855+
do_notify_node_up(#state{guid = GUID}) ->
856+
Nodes = rabbit_nodes:list_reachable() -- [node()],
857+
gen_server:abcast(Nodes, ?SERVER,
858+
{node_up, node(), rabbit_db_cluster:node_type(), GUID}),
859+
%% register other active rabbits with this rabbit
860+
DiskNodes = rabbit_db_cluster:disc_members(),
861+
_ = [gen_server:cast(
862+
?SERVER,
863+
{node_up, N, case lists:member(N, DiskNodes) of
864+
true -> disc;
865+
false -> ram
866+
end}) || N <- Nodes],
867+
ok.
868+
857869
handle_dead_rabbit(Node, State) ->
858870
%% TODO: This may turn out to be a performance hog when there are
859871
%% lots of nodes. We really only need to execute some of these

0 commit comments

Comments (0)