Keep exclusive/auto-delete queues with Khepri + network partition
[Why]
With Mnesia, when the network partition strategy is set to
`pause_minority`, nodes on the "minority side" are stopped.
Thus, the exclusive queues that were hosted by nodes on that minority
side are lost:
* Consumers connected on these nodes are disconnected because the nodes
are stopped.
* Queue records on the majority side are deleted from the metadata
store.
This was acceptable with Mnesia, given how this network partition
handling strategy is implemented. However, it does not work with Khepri
because the nodes on the "minority side" continue to run and serve
clients. As a result, the cluster ends up in an inconsistent situation:
1. The "majority side" deleted the queue records.
2. When the network partition is resolved, the "minority side" receives
the record deletion, but the queue processes continue to run.
The situation was similar for auto-delete queues.
[How]
With Khepri, we no longer delete transient queue records simply because
a node goes down. Thanks to this, an exclusive or auto-delete queue and
its consumer(s) are not affected by a network partition: they continue
to work.
However, if a node is permanently lost, we need to clean up dead queue
records. This was already done for durable queues with both Mnesia and
Khepri. But with Khepri, transient queue records persist in the store
like durable queue records (unlike with Mnesia).
That's why this commit renames the clean-up function
`rabbit_amqqueue:forget_all_durable/1` to
`rabbit_amqqueue:forget_all/1`, which deletes all queue records of
queues that were hosted on the given node, regardless of whether they
are transient or durable.
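As a rough illustration only (not the actual implementation), the
clean-up boils down to something like the Erlang sketch below, where
`queues_on_node/1` and `delete_record/1` are hypothetical placeholders
for the real metadata-store calls:

```erlang
-module(forget_all_sketch).
-export([forget_all/1]).

%% Hypothetical sketch: delete every queue record, durable or
%% transient, for queues that were hosted on the lost node.
forget_all(DeadNode) ->
    Queues = queues_on_node(DeadNode),
    lists:foreach(fun delete_record/1, Queues),
    ok.

%% Hypothetical placeholder: list queue records hosted on a node.
queues_on_node(_Node) ->
    [].

%% Hypothetical placeholder: delete a single queue record from the
%% metadata store.
delete_record(_QName) ->
    ok.
```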
In addition, the queue process spawns a temporary process that retries
the deletion of the underlying record indefinitely, if no other process
is waiting for a reply from the queue process. That is the case for
queues that are deleted because of an internal event (such as the
exclusive/auto-delete conditions). The queue process then exits, which
notifies connections that the queue is gone.
Thanks to this, the temporary process will do its best to delete the
record in case of a network partition, whether the consumers go away
during or after that partition. That said, the node monitor drives some
failsafe code that cleans up the record if the queue process was killed
before it could delete its own record.
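A minimal sketch of that retry loop, assuming a hypothetical
`delete_record/1` standing in for the real metadata-store deletion
(this is not the actual RabbitMQ code):

```erlang
-module(record_deleter_sketch).
-export([spawn_deleter/1]).

%% Spawn an unlinked helper so it survives the queue process exiting.
spawn_deleter(QName) ->
    spawn(fun() -> retry_delete(QName) end).

%% Keep retrying until the metadata store accepts the deletion, e.g.
%% once the network partition heals.
retry_delete(QName) ->
    case delete_record(QName) of
        ok ->
            ok;
        {error, _Reason} ->
            timer:sleep(1000),
            retry_delete(QName)
    end.

%% Hypothetical placeholder for the real metadata-store deletion.
delete_record(_QName) ->
    ok.
```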
Fixes #12949, #12597, #14527.
(cherry picked from commit 3c4d073)