From 2c514a9e337a658bf5d9c69d3c7d23863642386d Mon Sep 17 00:00:00 2001 From: Michael Davis Date: Wed, 11 Sep 2024 13:26:36 -0400 Subject: [PATCH] rabbit_khepri: Retry fence in init/1 in cases of timeout (cherry picked from commit 3afb379f0e53106cfe14d5d58f1c5f803befea68) --- deps/rabbit/src/rabbit_khepri.erl | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/deps/rabbit/src/rabbit_khepri.erl b/deps/rabbit/src/rabbit_khepri.erl index d8f35e990fba..f6a84a6afcac 100644 --- a/deps/rabbit/src/rabbit_khepri.erl +++ b/deps/rabbit/src/rabbit_khepri.erl @@ -289,6 +289,12 @@ retry_timeout() -> undefined -> 30000 end. +retry_limit() -> + case application:get_env(rabbit, khepri_leader_wait_retry_limit) of + {ok, T} -> T; + undefined -> 10 + end. + %% @private -spec init(IsVirgin) -> Ret when @@ -305,11 +311,7 @@ init(IsVirgin) -> "Found the following metadata store members: ~p", [Members], #{domain => ?RMQLOG_DOMAIN_DB}), maybe - ?LOG_DEBUG( - "Khepri-based " ?RA_FRIENDLY_NAME " catching up on " - "replication to the Raft cluster leader", [], - #{domain => ?RMQLOG_DOMAIN_DB}), - ok ?= fence(retry_timeout()), + ok ?= await_replication(), ?LOG_DEBUG( "local Khepri-based " ?RA_FRIENDLY_NAME " member is caught " "up to the Raft cluster leader", [], @@ -331,6 +333,24 @@ init(IsVirgin) -> end end. +await_replication() -> + await_replication(retry_timeout(), retry_limit()). + +await_replication(_Timeout, 0) -> + {error, timeout}; +await_replication(Timeout, Retries) -> + ?LOG_DEBUG( + "Khepri-based " ?RA_FRIENDLY_NAME " waiting to catch up on replication " + "to the Raft cluster leader. Waiting for ~tb ms, ~tb retries left", + [Timeout, Retries], + #{domain => ?RMQLOG_DOMAIN_DB}), + case fence(Timeout) of + ok -> + ok; + {error, timeout} -> + await_replication(Timeout, Retries -1) + end. + %% @private can_join_cluster(DiscoveryNode) when is_atom(DiscoveryNode) ->