|
17 | 17 | -module(rabbit_autoheal). |
18 | 18 |
|
19 | 19 | -export([init/0, enabled/0, maybe_start/1, rabbit_down/2, node_down/2, |
20 | | - handle_msg/3]). |
| 20 | + handle_msg/3, process_down/2]). |
21 | 21 |
|
22 | 22 | %% The named process we are running in. |
23 | 23 | -define(SERVER, rabbit_node_monitor). |
@@ -196,6 +196,16 @@ node_down(Node, _State) -> |
196 | 196 | rabbit_log:info("Autoheal: aborting - ~p went down~n", [Node]), |
197 | 197 | not_healing. |
198 | 198 |
|
| 199 | +%% If the process that has to restart the node exits with a reason other than |
| 200 | +%% normal, we go back to the not_healing state so the node is able to recover. |
| 201 | +process_down({'EXIT', Pid, Reason}, {restarting, Pid}) when Reason =/= normal -> |
| 202 | + rabbit_log:info("Autoheal: aborting - the process responsible for restarting the " |
| 203 | + "node terminated with reason: ~p~n", [Reason]), |
| 204 | + not_healing; |
| 205 | + |
| 206 | +process_down(_, State) -> |
| 207 | + State. |
| 208 | + |
199 | 209 | %% By receiving this message we become the leader |
200 | 210 | %% TODO should we try to debounce this? |
201 | 211 | handle_msg({request_start, Node}, |
@@ -252,17 +262,19 @@ handle_msg({become_winner, _}, |
252 | 262 | handle_msg({winner_is, Winner}, State = not_healing, |
253 | 263 | _Partitions) -> |
254 | 264 | %% This node is a loser, nothing else. |
255 | | - restart_loser(State, Winner), |
256 | | - restarting; |
| 265 | + Pid = restart_loser(State, Winner), |
| 266 | + {restarting, Pid}; |
257 | 267 | handle_msg({winner_is, Winner}, State = {leader_waiting, Winner, _}, |
258 | 268 | _Partitions) -> |
259 | 269 | %% This node is the leader and a loser at the same time. |
260 | | - restart_loser(State, Winner), |
261 | | - restarting; |
| 270 | + Pid = restart_loser(State, Winner), |
| 271 | + {restarting, Pid}; |
262 | 272 |
|
263 | | -handle_msg(_, restarting, _Partitions) -> |
| 273 | +handle_msg(Request, {restarting, Pid} = St, _Partitions) -> |
264 | 274 | %% ignore, we can contribute no further |
265 | | - restarting; |
| 275 | + rabbit_log:info("Autoheal: Received the request ~p while waiting for ~p " |
| 276 | + "to restart the node. Ignoring it ~n", [Request, Pid]), |
| 277 | + St; |
266 | 278 |
|
267 | 279 | handle_msg(report_autoheal_status, not_healing, _Partitions) -> |
268 | 280 | %% The leader is asking about the autoheal status to us (the |
|
0 commit comments