Skip to content

Commit a04658d

Browse files
committed
Link process responsible of restart during autoheal, and abort if needed
If the process crashes unexpectedly, the autoheal process gets into a deadlock in the 'restarting' state, ignoring any new request from the winner. If the crash happens before the process is registered, no logs are generated. Thus, we need to link that process and abort the autoheal if the process finishes with a reason different from normal. rabbitmq-server#1346 [#150707017]
1 parent f2afe31 commit a04658d

File tree

2 files changed

+23
-7
lines changed

2 files changed

+23
-7
lines changed

src/rabbit_autoheal.erl

Lines changed: 19 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
-module(rabbit_autoheal).
1818

1919
-export([init/0, enabled/0, maybe_start/1, rabbit_down/2, node_down/2,
20-
handle_msg/3]).
20+
handle_msg/3, process_down/2]).
2121

2222
%% The named process we are running in.
2323
-define(SERVER, rabbit_node_monitor).
@@ -196,6 +196,16 @@ node_down(Node, _State) ->
196196
rabbit_log:info("Autoheal: aborting - ~p went down~n", [Node]),
197197
not_healing.
198198

199+
%% If the process that has to restart the node crashes for an unexpected reason,
200+
%% we go back to a not healing state so the node is able to recover.
201+
process_down({'EXIT', Pid, Reason}, {restarting, Pid}) when Reason =/= normal ->
202+
rabbit_log:info("Autoheal: aborting - the process responsible for restarting the "
203+
"node terminated with reason: ~p~n", [Reason]),
204+
not_healing;
205+
206+
process_down(_, State) ->
207+
State.
208+
199209
%% By receiving this message we become the leader
200210
%% TODO should we try to debounce this?
201211
handle_msg({request_start, Node},
@@ -252,17 +262,19 @@ handle_msg({become_winner, _},
252262
handle_msg({winner_is, Winner}, State = not_healing,
253263
_Partitions) ->
254264
%% This node is a loser, nothing else.
255-
restart_loser(State, Winner),
256-
restarting;
265+
Pid = restart_loser(State, Winner),
266+
{restarting, Pid};
257267
handle_msg({winner_is, Winner}, State = {leader_waiting, Winner, _},
258268
_Partitions) ->
259269
%% This node is the leader and a loser at the same time.
260-
restart_loser(State, Winner),
261-
restarting;
270+
Pid = restart_loser(State, Winner),
271+
{restarting, Pid};
262272

263-
handle_msg(_, restarting, _Partitions) ->
273+
handle_msg(Request, {restarting, Pid} = St, _Partitions) ->
264274
%% ignore, we can contribute no further
265-
restarting;
275+
rabbit_log:info("Autoheal: Received the request ~p while waiting for ~p "
276+
"to restart the node. Ignoring it ~n", [Request, Pid]),
277+
St;
266278

267279
handle_msg(report_autoheal_status, not_healing, _Partitions) ->
268280
%% The leader is asking about the autoheal status to us (the

src/rabbit_node_monitor.erl

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -623,6 +623,10 @@ handle_info(ping_up_nodes, State) ->
623623
[cast(N, keepalive) || N <- alive_nodes() -- [node()]],
624624
{noreply, ensure_keepalive_timer(State#state{keepalive_timer = undefined})};
625625

626+
handle_info({'EXIT', _, _} = Info, State = #state{autoheal = AState0}) ->
627+
AState = rabbit_autoheal:process_down(Info, AState0),
628+
{noreply, State#state{autoheal = AState}};
629+
626630
handle_info(_Info, State) ->
627631
{noreply, State}.
628632

0 commit comments

Comments
 (0)