Skip to content

Commit 3dd8a24

Browse files
committed
rabbit_peer_discovery: Cache computed node start time
[Why] The computation of a node's system-time-based start time is based on its monotonic-time-based start time plus the current time offset. However, since Erlang/OTP 26, that time offset is volatile by default. Therefore the computed system-time-based start time can change too. It happens in CI and this affects the seed node selection in the peer discovery code: because the sorting is unstable when the start time is the criteria, different instances of the peen discovery code could select a different node. [How] The same code is used to compute the start time. However, the computed value is cached in a persistent_term for future queries. This way we ensure that the value is computed once and thus all queries will return the same result.
1 parent 2291579 commit 3dd8a24

File tree

1 file changed

+80
-13
lines changed

1 file changed

+80
-13
lines changed

deps/rabbit/src/rabbit_peer_discovery.erl

Lines changed: 80 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,8 @@
2626
normalize/1,
2727
append_node_prefix/1,
2828
node_prefix/0]).
29-
-export([do_query_node_props/2]).
29+
-export([do_query_node_props/2,
30+
get_local_start_time/0]).
3031

3132
-ifdef(TEST).
3233
-export([query_node_props/1,
@@ -614,7 +615,7 @@ query_node_props2([{Node, Members} | Rest], NodesAndProps, FromNode) ->
614615
"queries properties from node '~ts'",
615616
[node(), Node]], FromNode),
616617
StartTime = get_node_start_time(
617-
Node, microsecond, FromNode),
618+
Node, FromNode),
618619
IsReady = is_node_db_ready(Node, FromNode),
619620
NodeAndProps = {Node, Members, StartTime, IsReady},
620621
NodesAndProps1 = [NodeAndProps | NodesAndProps],
@@ -642,9 +643,8 @@ query_node_props2([], NodesAndProps, _FromNode) ->
642643
?assert(length(NodesAndProps1) =< length(nodes(hidden))),
643644
NodesAndProps1.
644645

645-
-spec get_node_start_time(Node, Unit, FromNode) -> StartTime when
646+
-spec get_node_start_time(Node, FromNode) -> StartTime when
646647
Node :: node(),
647-
Unit :: erlang:time_unit(),
648648
FromNode :: node(),
649649
StartTime :: non_neg_integer().
650650
%% @doc Returns the start time of the given `Node' in `Unit'.
@@ -663,17 +663,84 @@ query_node_props2([], NodesAndProps, _FromNode) ->
663663
%% https://www.erlang.org/doc/man/erlang#time_offset-0 to get the full
664664
%% explanation of the computation.
665665
%%
666+
%% We also cache the computed start time. The reason is that since Erlang 26,
667+
%% the time offset its volatile by default. Therefore we may compute a
668+
%% different value each time. Caching the computed value gives up stability
669+
%% again.
670+
%%
666671
%% @private
667672

668-
get_node_start_time(Node, Unit, FromNode) ->
669-
NativeStartTime = erpc_call(
670-
Node, erlang, system_info, [start_time], FromNode),
671-
TimeOffset = erpc_call(Node, erlang, time_offset, [], FromNode),
672-
SystemStartTime = NativeStartTime + TimeOffset,
673-
StartTime = erpc_call(
674-
Node, erlang, convert_time_unit,
675-
[SystemStartTime, native, Unit], FromNode),
676-
StartTime.
673+
-define(START_TIME_KEY, rabbitmq_node_start_time).
674+
-define(START_TIME_UNIT, microsecond).
675+
676+
get_node_start_time(Node, FromNode) ->
677+
try
678+
erpc_call(Node, ?MODULE, get_local_start_time, [], FromNode)
679+
catch
680+
error:{exception, undef, [{?MODULE, _, _, _} | _]} ->
681+
get_node_start_time__v1(Node, FromNode)
682+
end.
683+
684+
get_node_start_time__v1(Node, FromNode) ->
685+
LockId = {?START_TIME_KEY, self()},
686+
true = global:set_lock(LockId, [Node]),
687+
try
688+
CachedStartTime = erpc_call(
689+
Node, persistent_term, get,
690+
[?START_TIME_KEY, undefined], FromNode),
691+
case CachedStartTime of
692+
undefined ->
693+
NativeStartTime = erpc_call(
694+
Node, erlang, system_info,
695+
[start_time], FromNode),
696+
TimeOffset = erpc_call(
697+
Node, erlang, time_offset, [], FromNode),
698+
SystemStartTime = NativeStartTime + TimeOffset,
699+
StartTime = erpc_call(
700+
Node, erlang, convert_time_unit,
701+
[SystemStartTime, native, ?START_TIME_UNIT],
702+
FromNode),
703+
erpc_call(
704+
Node, persistent_term, put, [?START_TIME_KEY, StartTime],
705+
FromNode),
706+
StartTime;
707+
_ ->
708+
CachedStartTime
709+
end
710+
after
711+
global:del_lock(LockId, [Node])
712+
end.
713+
714+
%% The function below is exactly the same code as above but with meant to run
715+
%% locally. This allows this node to compute its start time and cache it in
716+
%% advance.
717+
718+
get_local_start_time() ->
719+
Node = node(),
720+
LockId = {?START_TIME_KEY, self()},
721+
true = global:set_lock(LockId, [Node]),
722+
try
723+
CachedStartTime = persistent_term:get(?START_TIME_KEY, undefined),
724+
case CachedStartTime of
725+
undefined ->
726+
NativeStartTime = erlang:system_info(start_time),
727+
TimeOffset = erlang:time_offset(),
728+
SystemStartTime = NativeStartTime + TimeOffset,
729+
StartTime = erlang:convert_time_unit(
730+
SystemStartTime, native, ?START_TIME_UNIT),
731+
732+
?LOG_DEBUG(
733+
"Peer discovery: cache node start time (~b)",
734+
[StartTime],
735+
#{domain => ?RMQLOG_DOMAIN_PEER_DISC}),
736+
persistent_term:put(?START_TIME_KEY, StartTime),
737+
StartTime;
738+
_ ->
739+
CachedStartTime
740+
end
741+
after
742+
global:del_lock(LockId, [Node])
743+
end.
677744

678745
-spec is_node_db_ready(Node, FromNode) -> IsReady when
679746
Node :: node(),

0 commit comments

Comments
 (0)