Skip to content

Commit 80e817f

Browse files
author
Tim Watson
committed
Backport 5e80230e2225 (Merge of bug25094; unclustering ram nodes broken)
1 parent 56e1b53 commit 80e817f

File tree

2 files changed

+51
-33
lines changed

2 files changed

+51
-33
lines changed

src/rabbit_mnesia.erl

Lines changed: 14 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -726,40 +726,42 @@ reset(Force) ->
726726
end]),
727727
ensure_mnesia_not_running(),
728728
case not Force andalso is_clustered() andalso
729-
is_only_disc_node(node(), false)
729+
is_only_disc_node(node(), false)
730730
of
731731
true -> log_both("no other disc nodes running");
732732
false -> ok
733733
end,
734-
Node = node(),
735-
Nodes = all_clustered_nodes() -- [Node],
736734
case Force of
737-
true -> ok;
735+
true ->
736+
disconnect_nodes(nodes());
738737
false ->
739738
ensure_mnesia_dir(),
740739
start_mnesia(),
741-
RunningNodes =
740+
{Nodes, RunningNodes} =
742741
try
743742
%% Force=true here so that reset still works when clustered
744743
%% with a node which is down
745744
ok = init_db(read_cluster_nodes_config(), true),
746-
running_clustered_nodes() -- [Node]
745+
{all_clustered_nodes() -- [node()],
746+
running_clustered_nodes() -- [node()]}
747747
after
748748
stop_mnesia()
749749
end,
750750
leave_cluster(Nodes, RunningNodes),
751-
rabbit_misc:ensure_ok(mnesia:delete_schema([Node]),
752-
cannot_delete_schema)
751+
rabbit_misc:ensure_ok(mnesia:delete_schema([node()]),
752+
cannot_delete_schema),
753+
disconnect_nodes(Nodes)
753754
end,
754-
%% We need to make sure that we don't end up in a distributed
755-
%% Erlang system with nodes while not being in an Mnesia cluster
756-
%% with them. We don't handle that well.
757-
[erlang:disconnect_node(N) || N <- Nodes],
758755
ok = delete_cluster_nodes_config(),
759756
%% remove persisted messages and any other garbage we find
760757
ok = rabbit_file:recursive_delete(filelib:wildcard(dir() ++ "/*")),
761758
ok.
762759

760+
%% We need to make sure that we don't end up in a distributed Erlang
761+
%% system with nodes while not being in an Mnesia cluster with
762+
%% them. We don't handle that well.
763+
disconnect_nodes(Nodes) -> [erlang:disconnect_node(N) || N <- Nodes].
764+
763765
leave_cluster([], _) -> ok;
764766
leave_cluster(Nodes, RunningNodes) ->
765767
%% find at least one running cluster node and instruct it to

src/rabbit_tests.erl

Lines changed: 37 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -72,12 +72,10 @@ maybe_run_cluster_dependent_tests() ->
7272
run_cluster_dependent_tests(SecondaryNode) ->
7373
SecondaryNodeS = atom_to_list(SecondaryNode),
7474

75-
cover:stop(SecondaryNode),
7675
ok = control_action(stop_app, []),
77-
ok = control_action(reset, []),
76+
ok = safe_reset(),
7877
ok = control_action(cluster, [SecondaryNodeS]),
7978
ok = control_action(start_app, []),
80-
cover:start(SecondaryNode),
8179
ok = control_action(start_app, SecondaryNode, [], []),
8280

8381
io:format("Running cluster dependent tests with node ~p~n", [SecondaryNode]),
@@ -908,7 +906,7 @@ test_cluster_management2(SecondaryNode) ->
908906
ok = assert_ram_node(),
909907

910908
%% join cluster as a ram node
911-
ok = control_action(reset, []),
909+
ok = safe_reset(),
912910
ok = control_action(force_cluster, [SecondaryNodeS, "invalid1@invalid"]),
913911
ok = control_action(start_app, []),
914912
ok = control_action(stop_app, []),
@@ -965,29 +963,30 @@ test_cluster_management2(SecondaryNode) ->
965963
ok = assert_disc_node(),
966964

967965
%% turn a disk node into a ram node
968-
ok = control_action(reset, []),
966+
%%
967+
%% can't use safe_reset here since for some reason nodes()==[] and
968+
%% yet w/o stopping coverage things break
969+
with_suspended_cover(
970+
[SecondaryNode], fun () -> ok = control_action(reset, []) end),
969971
ok = control_action(cluster, [SecondaryNodeS]),
970972
ok = control_action(start_app, []),
971973
ok = control_action(stop_app, []),
972974
ok = assert_ram_node(),
973975

974976
%% NB: this will log an inconsistent_database error, which is harmless
975-
%% Turning cover on / off is OK even if we're not in general using cover,
976-
%% it just turns the engine on / off, doesn't actually log anything.
977-
cover:stop([SecondaryNode]),
978-
true = disconnect_node(SecondaryNode),
979-
pong = net_adm:ping(SecondaryNode),
980-
cover:start([SecondaryNode]),
977+
with_suspended_cover(
978+
[SecondaryNode], fun () ->
979+
true = disconnect_node(SecondaryNode),
980+
pong = net_adm:ping(SecondaryNode)
981+
end),
981982

982983
%% leaving a cluster as a ram node
983-
ok = control_action(reset, []),
984+
ok = safe_reset(),
984985
%% ...and as a disk node
985986
ok = control_action(cluster, [SecondaryNodeS, NodeS]),
986987
ok = control_action(start_app, []),
987988
ok = control_action(stop_app, []),
988-
cover:stop(SecondaryNode),
989-
ok = control_action(reset, []),
990-
cover:start(SecondaryNode),
989+
ok = safe_reset(),
991990

992991
%% attempt to leave cluster when no other node is alive
993992
ok = control_action(cluster, [SecondaryNodeS, NodeS]),
@@ -1002,22 +1001,39 @@ test_cluster_management2(SecondaryNode) ->
10021001
control_action(cluster, [SecondaryNodeS]),
10031002

10041003
%% leave system clustered, with the secondary node as a ram node
1005-
ok = control_action(force_reset, []),
1004+
with_suspended_cover(
1005+
[SecondaryNode], fun () -> ok = control_action(force_reset, []) end),
10061006
ok = control_action(start_app, []),
10071007
%% Yes, this is rather ugly. But since we're a clustered Mnesia
10081008
%% node and we're telling another clustered node to reset itself,
10091009
%% we will get disconnected half way through causing a
10101010
%% badrpc. This never happens in real life since rabbitmqctl is
1011-
%% not a clustered Mnesia node.
1012-
cover:stop(SecondaryNode),
1013-
{badrpc, nodedown} = control_action(force_reset, SecondaryNode, [], []),
1014-
pong = net_adm:ping(SecondaryNode),
1015-
cover:start(SecondaryNode),
1011+
%% not a clustered Mnesia node and is a hidden node.
1012+
with_suspended_cover(
1013+
[SecondaryNode],
1014+
fun () ->
1015+
{badrpc, nodedown} =
1016+
control_action(force_reset, SecondaryNode, [], []),
1017+
pong = net_adm:ping(SecondaryNode)
1018+
end),
10161019
ok = control_action(cluster, SecondaryNode, [NodeS], []),
10171020
ok = control_action(start_app, SecondaryNode, [], []),
10181021

10191022
passed.
10201023

1024+
%% 'cover' does not cope at all well with nodes disconnecting, which
1025+
%% happens as part of reset. So we turn it off temporarily. That is ok
1026+
%% even if we're not in general using cover, it just turns the engine
1027+
%% on / off and doesn't log anything.
1028+
safe_reset() -> with_suspended_cover(
1029+
nodes(), fun () -> control_action(reset, []) end).
1030+
1031+
with_suspended_cover(Nodes, Fun) ->
1032+
cover:stop(Nodes),
1033+
Res = Fun(),
1034+
cover:start(Nodes),
1035+
Res.
1036+
10211037
test_user_management() ->
10221038

10231039
%% lots of stuff that should fail

0 commit comments

Comments (0)