
Commit 6870cf1

rabbit_khepri: "fence" during init/1
`khepri:fence/0,1,2` queries the leader's Raft index and blocks the caller, for up to the given (or default) timeout, until the local member has caught up in log replication to that index. We want to do this during Khepri init to ensure that the local Khepri store is reasonably up to date before continuing with the boot process and starting listeners. This is conceptually similar to the call to `mnesia:wait_for_tables/2` during `rabbit_mnesia:init/0` and should have the same effect.
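For illustration only, here is a minimal sketch of the call pattern the commit relies on. The store id `my_store` and the 30-second timeout are placeholders for this example, not values taken from this commit; the actual code below uses `?STORE_ID` and `retry_timeout()`.

    %% Block until the local Khepri store member has replicated up to the
    %% leader's current Raft index, or return an error once Timeout elapses.
    wait_for_local_catch_up() ->
        StoreId = my_store,       %% placeholder store id
        Timeout = 30000,          %% placeholder timeout in milliseconds
        case khepri:fence(StoreId, Timeout) of
            ok ->
                ok;               %% caught up; safe to continue booting
            {error, _} = Error ->
                Error             %% e.g. timed out waiting for replication
        end.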
1 parent 56f4489 commit 6870cf1

1 file changed (+27, -18 lines)

deps/rabbit/src/rabbit_khepri.erl

Lines changed: 27 additions & 18 deletions
@@ -301,24 +301,30 @@ init(IsVirgin) ->
             ?LOG_NOTICE(
               "Found the following metadata store members: ~p", [Members],
               #{domain => ?RMQLOG_DOMAIN_DB}),
-            Ret = case IsVirgin of
-                      true ->
-                          register_projections();
-                      false ->
-                          ok
-                  end,
-            case Ret of
-                ok ->
-                    %% Delete transient queues on init.
-                    %% Note that we also do this in the
-                    %% `rabbit_amqqueue:on_node_down/1' callback. We must try
-                    %% this deletion during init because the cluster may have
-                    %% been in a minority when this node went down. We wait for
-                    %% a majority while registering projections above
-                    %% though so this deletion is likely to succeed.
-                    rabbit_amqqueue:delete_transient_queues_on_node(node());
-                {error, _} = Error ->
-                    Error
+            maybe
+                ?LOG_DEBUG(
+                  "Khepri-based " ?RA_FRIENDLY_NAME " catching up on "
+                  "replication to the Raft cluster leader", [],
+                  #{domain => ?RMQLOG_DOMAIN_DB}),
+                ok ?= fence(retry_timeout()),
+                ?LOG_DEBUG(
+                  "local Khepri-based " ?RA_FRIENDLY_NAME " member is caught "
+                  "up to the Raft cluster leader", [],
+                  #{domain => ?RMQLOG_DOMAIN_DB}),
+                ok ?= case IsVirgin of
+                          true ->
+                              register_projections();
+                          false ->
+                              ok
+                      end,
+                %% Delete transient queues on init.
+                %% Note that we also do this in the
+                %% `rabbit_amqqueue:on_node_down/1' callback. We must try this
+                %% deletion during init because the cluster may have been in a
+                %% minority when this node went down. We wait for a majority
+                %% while registering projections above though so this deletion
+                %% is likely to succeed.
+                rabbit_amqqueue:delete_transient_queues_on_node(node())
             end
         end.

@@ -1044,6 +1050,9 @@ info() ->
 handle_async_ret(RaEvent) ->
     khepri:handle_async_ret(?STORE_ID, RaEvent).
 
+fence(Timeout) ->
+    khepri:fence(?STORE_ID, Timeout).
+
 %% -------------------------------------------------------------------
 %% collect_payloads().
 %% -------------------------------------------------------------------
