From 8ce543a99692ee6cb64f1f19b4432b4ec197346b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jean-S=C3=A9bastien=20P=C3=A9dron?= Date: Wed, 27 Nov 2024 16:24:02 +0100 Subject: [PATCH] rabbit_quorum_queue: Wait for member add in `add_member/4` [Why] The `ra:member_add/3` call returns before the change is committed. This is ok for that addition but any follow-up changes to the cluster might be rejected with the `cluster_change_not_permitted` error. [How] Instead of changing other places to wait or retry their cluster membership change, this patch waits for the current add to be applied before proceeding and returning. This fixes some transient failures in CI where such follow-up changes are rejected and not retried, leaving the cluster in an unexpected state for the testcase. An example is with `quorum_queue_SUITE:force_shrink_member_to_current_member/1` (cherry picked from commit 99d8e90df3237ea395850d7154a88e4b609324dd) --- deps/rabbit/src/rabbit_quorum_queue.erl | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/deps/rabbit/src/rabbit_quorum_queue.erl b/deps/rabbit/src/rabbit_quorum_queue.erl index 0672e234fe6e..c59c8d8be09c 100644 --- a/deps/rabbit/src/rabbit_quorum_queue.erl +++ b/deps/rabbit/src/rabbit_quorum_queue.erl @@ -1346,7 +1346,7 @@ add_member(Q, Node, Membership, Timeout) when ?amqqueue_is_quorum(Q) -> maps:get(id, Conf) end, case ra:add_member(Members, ServerIdSpec, Timeout) of - {ok, _, Leader} -> + {ok, {RaIndex, RaTerm}, Leader} -> Fun = fun(Q1) -> Q2 = update_type_state( Q1, fun(#{nodes := Nodes} = Ts) -> @@ -1354,6 +1354,19 @@ add_member(Q, Node, Membership, Timeout) when ?amqqueue_is_quorum(Q) -> end), amqqueue:set_pid(Q2, Leader) end, + %% The `ra:member_add/3` call above returns before the + %% change is committed. This is ok for that addition but + %% any follow-up changes to the cluster might be rejected + %% with the `cluster_change_not_permitted` error. + %% + %% Instead of changing other places to wait or retry their + %% cluster membership change, we wait for the current add + %% to be applied using a conditional leader query before + %% proceeding and returning. + {ok, _, _} = ra:leader_query( + Leader, + {erlang, is_list, []}, + #{condition => {applied, {RaIndex, RaTerm}}}), _ = rabbit_amqqueue:update(QName, Fun), rabbit_log:info("Added a replica of quorum ~ts on node ~ts", [rabbit_misc:rs(QName), Node]), ok;