Skip to content

Commit 0b84490

Browse files
author
Daniil Fedotov
committed
Do not crash on vhost recovery failure if vhost strategy is continue
When a node is starting or restarting, it can fail to recover a vhost. The `continue` vhost recovery strategy means that vhosts can be down if they cannot be recovered. This behaviour should also apply to node startup, which should skip failing vhosts.
1 parent 049c902 commit 0b84490

File tree

2 files changed

+66
-19
lines changed

2 files changed

+66
-19
lines changed

src/rabbit_vhost.erl

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -55,8 +55,9 @@ recover() ->
5555
%% rabbit_vhost_sup_sup will start the actual recovery.
5656
%% So recovery will be run every time a vhost supervisor is restarted.
5757
ok = rabbit_vhost_sup_sup:start(),
58-
[{ok, _} = rabbit_vhost_sup_sup:vhost_sup(VHost)
59-
|| VHost <- rabbit_vhost:list()],
58+
59+
[ ok = rabbit_vhost_sup_sup:init_vhost(VHost)
60+
|| VHost <- rabbit_vhost:list()],
6061
ok.
6162

6263
recover(VHost) ->

src/rabbit_vhost_sup_sup.erl

Lines changed: 63 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -23,15 +23,15 @@
2323
-export([init/1]).
2424

2525
-export([start_link/0, start/0]).
26-
-export([vhost_sup/1, vhost_sup/2, save_vhost_sup/3]).
26+
-export([init_vhost/1, vhost_sup/1, vhost_sup/2, save_vhost_sup/3]).
2727
-export([delete_on_all_nodes/1]).
2828
-export([start_on_all_nodes/1]).
2929

3030
-export([save_vhost_process/2]).
3131
-export([is_vhost_alive/1]).
3232

3333
%% Internal
34-
-export([stop_and_delete_vhost/1]).
34+
-export([stop_and_delete_vhost/1, start_vhost/1]).
3535

3636
-record(vhost_sup, {vhost, vhost_sup_pid, wrapper_pid, vhost_process_pid}).
3737

@@ -59,8 +59,13 @@ init([]) ->
5959
[rabbit_vhost_sup_wrapper, rabbit_vhost_sup]}]}}.
6060

6161
%% Starts the supervision tree of VHost on every running cluster node.
%%
%% Returns `ok' when the vhost was started on all nodes. Otherwise returns
%% `{error, {failed_to_start_vhost_on_nodes, Failures}}', where Failures is a
%% list of `{Node, Result}' pairs for the nodes that did not report success.
start_on_all_nodes(VHost) ->
    NodesStart = [ {Node, start_vhost(VHost, Node)}
                   || Node <- rabbit_nodes:all_running() ],
    %% start_vhost/2 reports success as {ok, Pid}, never as a bare `ok',
    %% so success must be recognised by that shape.
    Failures = lists:filter(fun({_, {ok, _}}) -> false;
                               (_)            -> true
                            end,
                            NodesStart),
    case Failures of
        []     -> ok;
        Errors -> {error, {failed_to_start_vhost_on_nodes, Errors}}
    end.
6469

6570
delete_on_all_nodes(VHost) ->
6671
[ stop_and_delete_vhost(VHost, Node) || Node <- rabbit_nodes:all_running() ],
@@ -101,9 +106,31 @@ stop_and_delete_vhost(VHost, Node) ->
101106
{error, RpcErr}
102107
end.
103108

109+
%% Starts the supervision tree of VHost on the local node.
%%
%% Returns `ok' when the vhost was started (or was already running), and
%% `{error, {no_such_vhost, VHost}}' when the vhost does not exist.
%%
%% Any other start failure is handled according to vhost_restart_strategy():
%%   permanent - the failure is logged as an error and `{error, Reason}'
%%               is thrown;
%%   transient - the failure is logged as a warning and `ok' is returned,
%%               leaving the vhost stopped on this node.
%% NOTE(review): `transient' presumably corresponds to the `continue'
%% recovery strategy mentioned in the commit message - confirm against
%% vhost_restart_strategy/0.
-spec init_vhost(rabbit_types:vhost()) ->
    ok | {error, {no_such_vhost, rabbit_types:vhost()}}.
init_vhost(VHost) ->
    case start_vhost(VHost) of
        {ok, _} -> ok;
        {error, {no_such_vhost, VHost}} ->
            {error, {no_such_vhost, VHost}};
        {error, Reason} ->
            case vhost_restart_strategy() of
                permanent ->
                    rabbit_log:error(
                        "Unable to initialize vhost data store for vhost '~s'."
                        " Reason: ~p",
                        [VHost, Reason]),
                    throw({error, Reason});
                transient ->
                    rabbit_log:warning(
                        "Unable to initialize vhost data store for vhost '~s'."
                        " The vhost will be stopped for this node. "
                        " Reason: ~p",
                        [VHost, Reason]),
                    ok
            end
    end.
132+
104133
-spec vhost_sup(rabbit_types:vhost(), node()) -> {ok, pid()} | {error, {no_such_vhost, rabbit_types:vhost()} | term()}.
105-
vhost_sup(VHost, Local) when Local == node(self()) ->
106-
vhost_sup(VHost);
107134
vhost_sup(VHost, Node) ->
108135
case rabbit_misc:rpc_call(Node, rabbit_vhost_sup_sup, vhost_sup, [VHost]) of
109136
{ok, Pid} when is_pid(Pid) ->
@@ -114,23 +141,42 @@ vhost_sup(VHost, Node) ->
114141

115142
%% Returns the supervisor pid registered for VHost on the local node,
%% starting the vhost supervision tree first if no pid is registered yet.
%%
%% Returns `{error, {no_such_vhost, VHost}}' when the vhost does not exist.
%% Any other start failure is thrown as `{error, Reason}'.
-spec vhost_sup(rabbit_types:vhost()) -> {ok, pid()} | {error, {no_such_vhost, rabbit_types:vhost()}}.
vhost_sup(VHost) ->
    case vhost_sup_pid(VHost) of
        no_pid ->
            %% start_vhost/1 returns {ok, Pid} on success, not a bare `ok'.
            case start_vhost(VHost) of
                {ok, _} ->
                    true = is_vhost_alive(VHost),
                    %% Look the pid up again so that both branches return
                    %% the same registered pid as the already-running case.
                    {ok, _} = vhost_sup_pid(VHost);
                {error, {no_such_vhost, VHost}} ->
                    {error, {no_such_vhost, VHost}};
                Error ->
                    throw(Error)
            end;
        {ok, Pid} when is_pid(Pid) ->
            {ok, Pid}
    end.
158+
159+
%% Starts the supervision tree of VHost on Node by calling start_vhost/1
%% there over RPC.
%%
%% Returns `{ok, Pid}' on success. An `{error, Reason}' produced by the
%% remote start_vhost/1 is passed through unchanged, and an RPC failure
%% `{badrpc, RpcErr}' is reported as `{error, RpcErr}'.
-spec start_vhost(rabbit_types:vhost(), node()) -> {ok, pid()} | {error, term()}.
start_vhost(VHost, Node) ->
    case rabbit_misc:rpc_call(Node, rabbit_vhost_sup_sup, start_vhost, [VHost]) of
        {ok, Pid} when is_pid(Pid) ->
            {ok, Pid};
        %% The remote call can legitimately fail (missing vhost, failed
        %% start_child); return the error instead of crashing the caller.
        {error, _} = Error ->
            Error;
        {badrpc, RpcErr} ->
            {error, RpcErr}
    end.
167+
168+
%% Starts a child of this supervisor for VHost on the local node.
%%
%% Returns `{error, {no_such_vhost, VHost}}' when the vhost does not exist.
%% A child that is already running counts as success and its pid is returned.
%% Any other start_child error is returned as `{error, Reason}'.
-spec start_vhost(rabbit_types:vhost()) -> {ok, pid()} | {error, term()}.
start_vhost(VHost) ->
    case rabbit_vhost:exists(VHost) of
        true ->
            case supervisor2:start_child(?MODULE, [VHost]) of
                {error, {already_started, Running}} -> {ok, Running};
                {error, _} = Failure                -> Failure;
                {ok, _} = Started                   -> Started
            end;
        false ->
            {error, {no_such_vhost, VHost}}
    end.
132179

133-
134180
-spec is_vhost_alive(rabbit_types:vhost()) -> boolean().
135181
is_vhost_alive(VHost) ->
136182
%% A vhost is considered alive if it's supervision tree is alive and

0 commit comments

Comments
 (0)