Commit bee9a42

Merge pull request #1347 from rabbitmq/autoheal-deadlock
Link process responsible for restart during autoheal, and abort if needed
2 parents: b14868e + cc2f919 (commit bee9a42)

File tree

2 files changed: +35 -13 lines changed


src/rabbit_autoheal.erl

Lines changed: 19 additions & 7 deletions
@@ -17,7 +17,7 @@
 -module(rabbit_autoheal).
 
 -export([init/0, enabled/0, maybe_start/1, rabbit_down/2, node_down/2,
-         handle_msg/3]).
+         handle_msg/3, process_down/2]).
 
 %% The named process we are running in.
 -define(SERVER, rabbit_node_monitor).
@@ -196,6 +196,16 @@ node_down(Node, _State) ->
     rabbit_log:info("Autoheal: aborting - ~p went down~n", [Node]),
     not_healing.
 
+%% If the process that has to restart the node crashes for an unexpected reason,
+%% we go back to a not healing state so the node is able to recover.
+process_down({'EXIT', Pid, Reason}, {restarting, Pid}) when Reason =/= normal ->
+    rabbit_log:info("Autoheal: aborting - the process responsible for restarting the "
+                    "node terminated with reason: ~p~n", [Reason]),
+    not_healing;
+
+process_down(_, State) ->
+    State.
+
 %% By receiving this message we become the leader
 %% TODO should we try to debounce this?
 handle_msg({request_start, Node},
@@ -252,17 +262,19 @@ handle_msg({become_winner, _},
 handle_msg({winner_is, Winner}, State = not_healing,
            _Partitions) ->
     %% This node is a loser, nothing else.
-    restart_loser(State, Winner),
-    restarting;
+    Pid = restart_loser(State, Winner),
+    {restarting, Pid};
 handle_msg({winner_is, Winner}, State = {leader_waiting, Winner, _},
            _Partitions) ->
     %% This node is the leader and a loser at the same time.
-    restart_loser(State, Winner),
-    restarting;
+    Pid = restart_loser(State, Winner),
+    {restarting, Pid};
 
-handle_msg(_, restarting, _Partitions) ->
+handle_msg(Request, {restarting, Pid} = St, _Partitions) ->
     %% ignore, we can contribute no further
-    restarting;
+    rabbit_log:info("Autoheal: Received the request ~p while waiting for ~p "
+                    "to restart the node. Ignoring it ~n", [Request, Pid]),
+    St;
 
 handle_msg(report_autoheal_status, not_healing, _Partitions) ->
     %% The leader is asking about the autoheal status to us (the
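
With these clauses, the autoheal state machine gains a {restarting, Pid} state and a process_down/2 transition out of it: an abnormal exit of the restarter process drops back to not_healing, while a normal exit or an exit from an unrelated process leaves the state untouched. A standalone sketch of those transitions, assuming the clauses above; the module name and calls are illustrative and not part of the commit:

-module(autoheal_process_down_sketch).
-export([demo/0]).

%% Standalone sketch, not part of the commit: it exercises the process_down/2
%% clauses shown above, assuming the rabbit_autoheal module from this commit
%% is on the code path.
demo() ->
    Restarter = spawn(fun () -> ok end),
    Other     = spawn(fun () -> ok end),
    %% An abnormal exit of the restarter aborts the heal:
    not_healing =
        rabbit_autoheal:process_down({'EXIT', Restarter, killed},
                                     {restarting, Restarter}),
    %% A normal exit, or an exit from an unrelated process, leaves the
    %% 'restarting' state untouched (second clause):
    {restarting, Restarter} =
        rabbit_autoheal:process_down({'EXIT', Restarter, normal},
                                     {restarting, Restarter}),
    {restarting, Restarter} =
        rabbit_autoheal:process_down({'EXIT', Other, killed},
                                     {restarting, Restarter}),
    ok.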

src/rabbit_node_monitor.erl

Lines changed: 16 additions & 6 deletions
@@ -623,6 +623,10 @@ handle_info(ping_up_nodes, State) ->
     [cast(N, keepalive) || N <- alive_nodes() -- [node()]],
     {noreply, ensure_keepalive_timer(State#state{keepalive_timer = undefined})};
 
+handle_info({'EXIT', _, _} = Info, State = #state{autoheal = AState0}) ->
+    AState = rabbit_autoheal:process_down(Info, AState0),
+    {noreply, State#state{autoheal = AState}};
+
 handle_info(_Info, State) ->
     {noreply, State}.
 
@@ -686,12 +690,18 @@ await_cluster_recovery(Condition) ->
     ok.
 
 run_outside_applications(Fun, WaitForExistingProcess) ->
-    spawn(fun () ->
-              %% If our group leader is inside an application we are about
-              %% to stop, application:stop/1 does not return.
-              group_leader(whereis(init), self()),
-              register_outside_app_process(Fun, WaitForExistingProcess)
-          end).
+    spawn_link(fun () ->
+              %% Ignore exit messages from the monitor - the link is needed
+              %% to ensure the monitor detects abnormal exits from this process
+              %% and can reset the 'restarting' status on the autoheal, avoiding
+              %% a deadlock. The monitor is restarted when rabbit does, so messages
+              %% in the other direction should be ignored.
+              process_flag(trap_exit, true),
+              %% If our group leader is inside an application we are about
+              %% to stop, application:stop/1 does not return.
+              group_leader(whereis(init), self()),
+              register_outside_app_process(Fun, WaitForExistingProcess)
+          end).
 
 register_outside_app_process(Fun, WaitForExistingProcess) ->
     %% Ensure only one such process at a time, the exit(badarg) is
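
The switch to spawn_link/1 in run_outside_applications/2, together with the new handle_info({'EXIT', _, _}, ...) clause above, is what turns a crash of the restarter into a message the node monitor can act on instead of a silent deadlock in the 'restarting' state. A minimal, self-contained sketch of that general link + trap_exit pattern; names are illustrative and not taken from the commit, and here the parent plays the role that rabbit_node_monitor plays via its handle_info clause:

-module(autoheal_link_sketch).
-export([run/0]).

%% Illustrative only: the parent traps exits and links to a worker, so an
%% abnormal worker exit arrives as an {'EXIT', Pid, Reason} message rather
%% than killing the parent. In the commit, rabbit_node_monitor is the parent
%% and the process spawned by run_outside_applications/2 is the worker.
run() ->
    process_flag(trap_exit, true),
    Worker = spawn_link(fun () ->
                            %% Pretend the restart attempt blows up.
                            exit(restart_failed)
                        end),
    State0 = {restarting, Worker},
    receive
        {'EXIT', Worker, normal} ->
            %% Clean exit: keep waiting for the restart to finish.
            State0;
        {'EXIT', Worker, Reason} ->
            io:format("restarter died: ~p; resetting to not_healing~n",
                      [Reason]),
            not_healing
    after 5000 ->
            State0
    end.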
