Skip to content

Commit 7893297

Browse files
committed
rabbit_peer_discovery: Retry rpc calls
1 parent 167934e commit 7893297

File tree

1 file changed

+40
-7
lines changed

1 file changed

+40
-7
lines changed

deps/rabbit/src/rabbit_peer_discovery.erl

Lines changed: 40 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -380,6 +380,8 @@ check_discovered_nodes_list_validity(DiscoveredNodes, BadNodeType)
380380
%%
381381
%% @private
382382

383+
-define(PEER_PARENT_NODE_KEY(Peer), {?MODULE, peer_parent_node, Peer}).
384+
383385
query_node_props(Nodes) when Nodes =/= [] ->
384386
ThisNode = node(),
385387
{Prefix, Suffix} = rabbit_nodes_common:parts(ThisNode),
@@ -444,13 +446,21 @@ query_node_props(Nodes) when Nodes =/= [] ->
444446
[Peer],
445447
#{domain => ?RMQLOG_DOMAIN_PEER_DISC}),
446448
try
449+
_ = peer:call(
450+
Pid,
451+
persistent_term, put,
452+
[?PEER_PARENT_NODE_KEY(Peer), ThisNode]),
453+
447454
NodesAndProps1 = (
448455
do_query_node_props([ThisNode]) ++
449456
peer:call(
450457
Pid, ?MODULE, do_query_node_props, [Nodes], 180000)),
451458
NodesAndProps2 = sort_nodes_and_props(NodesAndProps1),
452459
NodesAndProps2
453460
after
461+
_ = peer:call(
462+
Pid,
463+
persistent_term, erase, [?PEER_PARENT_NODE_KEY(Peer)]),
454464
peer:stop(Pid)
455465
end;
456466
{error, _} = Error ->
@@ -579,7 +589,12 @@ do_query_node_props(Nodes) when Nodes =/= [] ->
579589

580590
%% TODO: Replace with `rabbit_nodes:list_members/0' when the oldest
581591
%% supported version has it.
582-
MembersPerNode = erpc:multicall(Nodes, rabbit_nodes, all, []),
592+
MembersPerNode = [try
593+
{ok, erpc_call(Node, rabbit_nodes, all, [])}
594+
catch
595+
Class:Reason ->
596+
{Class, Reason}
597+
end || Node <- Nodes],
583598
query_node_props1(Nodes, MembersPerNode, []).
584599

585600
query_node_props1(
@@ -588,7 +603,7 @@ query_node_props1(
588603
NodesAndProps1 = [NodeAndProps | NodesAndProps],
589604
query_node_props1(Nodes, MembersPerNode, NodesAndProps1);
590605
query_node_props1(
591-
[Node | Nodes], [{error, _} = Error | MembersPerNode], NodesAndProps) ->
606+
[Node | Nodes], [{_, _} = Error | MembersPerNode], NodesAndProps) ->
592607
%% We consider that an error means the remote node is unreachable or not
593608
%% ready. Therefore, we exclude it from the list of discovered nodes as we
594609
%% won't be able to join it anyway.
@@ -604,7 +619,7 @@ query_node_props1([], [], NodesAndProps) ->
604619

605620
query_node_props2([{Node, Members} | Rest], NodesAndProps) ->
606621
try
607-
erpc:call(
622+
erpc_call(
608623
Node, logger, debug,
609624
["Peer discovery: temporary hidden node '~ts' queries properties "
610625
"from node '~ts'", [node(), Node]]),
@@ -656,10 +671,10 @@ query_node_props2([], NodesAndProps) ->
656671
%% @private
657672

658673
get_node_start_time(Node, Unit) ->
659-
NativeStartTime = erpc:call(Node, erlang, system_info, [start_time]),
660-
TimeOffset = erpc:call(Node, erlang, time_offset, []),
674+
NativeStartTime = erpc_call(Node, erlang, system_info, [start_time]),
675+
TimeOffset = erpc_call(Node, erlang, time_offset, []),
661676
SystemStartTime = NativeStartTime + TimeOffset,
662-
StartTime = erpc:call(
677+
StartTime = erpc_call(
663678
Node, erlang, convert_time_unit,
664679
[SystemStartTime, native, Unit]),
665680
StartTime.
@@ -679,12 +694,30 @@ is_node_db_ready(Node) when Node =:= node() ->
679694
true;
680695
is_node_db_ready(Node) ->
681696
try
682-
erpc:call(Node, rabbit_db, is_init_finished, [])
697+
erpc_call(Node, rabbit_db, is_init_finished, [])
683698
catch
684699
_:{exception, undef, [{rabbit_db, is_init_finished, _, _} | _]} ->
685700
undefined
686701
end.
687702

703+
%% Wrapper around `erpc:call/4' that reports erpc failures before re-raising.
%%
%% Runs `Mod:Fun(Args)' on `Node' and returns whatever `erpc:call/4' returns.
%% If the call raises `error:{erpc, Reason}', the function:
%%   1. looks up the parent node stored in `persistent_term' under
%%      `?PEER_PARENT_NODE_KEY(node())', defaulting to the local node when
%%      no such key is set;
%%   2. makes a best-effort attempt to log an alert on that node (any
%%      failure of this logging call is ignored via `catch');
%%   3. re-raises the original `{erpc, Reason}' error with its stacktrace.
%% Any other exception class/reason is not caught here and propagates as is.
%%
%% NOTE(review): the sleep + recursive retry below is commented out, so this
%% function does not retry at the moment -- confirm whether that is intended.
erpc_call(Node, Mod, Fun, Args) ->
704+
try
705+
erpc:call(Node, Mod, Fun, Args)
706+
catch
707+
error:{erpc, Reason}:Stacktrace ->
708+
Peer = node(),
709+
ParentNode = persistent_term:get(
710+
?PEER_PARENT_NODE_KEY(Peer), Peer),
711+
_ = catch erpc:call(
712+
ParentNode,
713+
logger, alert, % XXX
714+
["Peer discovery: temporary hidden node '~ts' failed to connect to '~ts': ~0p",
715+
[Peer, Node, Reason]]),
716+
erlang:raise(error, {erpc, Reason}, Stacktrace)
717+
%timer:sleep(200),
718+
%erpc_call(Node, Mod, Fun, Args)
719+
end.
720+
688721
-spec sort_nodes_and_props(NodesAndProps) ->
689722
SortedNodesAndProps when
690723
NodesAndProps :: [node_and_props()],

0 commit comments

Comments
 (0)