Skip to content

Commit a5fed7b

Browse files
committed
rabbit_node_monitor: Notify rabbit is live when handling a nodeup message
[Why]

So far, when there was a network partition with Mnesia, the most popular partition handling strategies restarted RabbitMQ nodes. Therefore, `rabbit` would execute the boot steps, and one of them would notify other members of the cluster that "this RabbitMQ node is live". With Khepri, nodes are not restarted anymore, and thus boot steps are not executed at the end of a network partition. As a consequence, other members are not notified that a member is back online.

[How]

When the node monitor receives the `nodeup` message (managed by Erlang, meaning "a remote Erlang node just connected to this node through Erlang distribution"), a `node_up` message is sent to all cluster members (meaning "RabbitMQ is now running on the originating node"). Yeah, very poor naming...

This lets the RabbitMQ node monitor know when other nodes running RabbitMQ are back online and react accordingly.

Note that if a node is restarted, another node could receive the `node_up` message twice, so the actions behind it must be idempotent.
1 parent a09383d commit a5fed7b

File tree

1 file changed

+22
-10
lines changed

1 file changed

+22
-10
lines changed

deps/rabbit/src/rabbit_node_monitor.erl

Lines changed: 22 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -430,16 +430,8 @@ handle_call(status, _From, State = #state{partitions = Partitions}) ->
430430
handle_call(_Request, _From, State) ->
431431
{noreply, State}.
432432

433-
handle_cast(notify_node_up, State = #state{guid = GUID}) ->
434-
Nodes = rabbit_nodes:list_reachable() -- [node()],
435-
gen_server:abcast(Nodes, ?SERVER,
436-
{node_up, node(), rabbit_db_cluster:node_type(), GUID}),
437-
%% register other active rabbits with this rabbit
438-
DiskNodes = rabbit_db_cluster:disc_members(),
439-
[gen_server:cast(?SERVER, {node_up, N, case lists:member(N, DiskNodes) of
440-
true -> disc;
441-
false -> ram
442-
end}) || N <- Nodes],
433+
handle_cast(notify_node_up, State) ->
434+
do_notify_node_up(State),
443435
{noreply, State};
444436

445437
%%----------------------------------------------------------------------------
@@ -665,6 +657,12 @@ handle_info({nodedown, Node, Info}, State) ->
665657

666658
handle_info({nodeup, Node, _Info}, State) ->
667659
?LOG_INFO("node ~tp up", [Node]),
660+
%% We notify that `rabbit' is up here too (in addition to the message sent
661+
%% explicitly by a boot step). That's because nodes may go down then up
662+
%% during a network partition, and with Khepri, nodes are not restarted
663+
%% (unlike with some partition handling strategies used with Mnesia), and
664+
%% thus the boot steps are not executed.
665+
do_notify_node_up(State),
668666
{noreply, State};
669667

670668
handle_info({mnesia_system_event,
@@ -854,6 +852,20 @@ wait_for_cluster_recovery(Condition) ->
854852
wait_for_cluster_recovery(Condition)
855853
end.
856854

855+
do_notify_node_up(#state{guid = GUID}) ->
856+
Nodes = rabbit_nodes:list_reachable() -- [node()],
857+
gen_server:abcast(Nodes, ?SERVER,
858+
{node_up, node(), rabbit_db_cluster:node_type(), GUID}),
859+
%% register other active rabbits with this rabbit
860+
DiskNodes = rabbit_db_cluster:disc_members(),
861+
_ = [gen_server:cast(
862+
?SERVER,
863+
{node_up, N, case lists:member(N, DiskNodes) of
864+
true -> disc;
865+
false -> ram
866+
end}) || N <- Nodes],
867+
ok.
868+
857869
handle_dead_rabbit(Node, State) ->
858870
%% TODO: This may turn out to be a performance hog when there are
859871
%% lots of nodes. We really only need to execute some of these

0 commit comments

Comments (0)