Skip to content

Commit 3c3a39b

Browse files
committed
Ensure AAE tree wait fails test if timeout
Some refactoring to make sure errors get reported properly when things go wrong. Wait until functions need to assert on the return value, and this return value is more useful if it contains errors from nodes timing out, Bad RPCs, etc.
1 parent 97a3905 commit 3c3a39b

File tree

1 file changed

+55
-39
lines changed

1 file changed

+55
-39
lines changed

src/rt.erl

Lines changed: 55 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -828,61 +828,77 @@ wait_until_nodes_agree_about_ownership(Nodes) ->
828828
%% AAE support
829829
wait_until_aae_trees_built(Nodes) ->
830830
lager:info("Wait until AAE builds all partition trees across ~p", [Nodes]),
831-
%% Wait until all nodes report no undefined trees
832-
wait_until(fun() ->
833-
lists:foldl(aae_tree_built_fun(), true, Nodes)
834-
end).
831+
BuiltFun = fun() -> lists:foldl(aae_tree_built_fun(), true, Nodes) end,
832+
?assertEqual(ok, wait_until(BuiltFun)),
833+
ok.
835834

836835
aae_tree_built_fun() ->
837-
fun(_, false) ->
838-
false;
839-
(Node, _AllBuilt = true) ->
840-
InfoRes = get_aae_tree_info(Node),
841-
case all_trees_have_build_times(InfoRes) of
842-
true ->
843-
{ok, Info} = InfoRes,
844-
Partitions = [I || {I, _} <- Info],
845-
lager:debug("Check if really built by locking"),
846-
847-
%% Try to lock each partition. If you get not_built,
848-
%% the manager has not detected the built process has
849-
%% died yet.
850-
%% Notice that the process locking is spawned by the
851-
%% pmap. That's important! as it should die eventually
852-
%% so the test can lock on the tree.
853-
AllBuilt = lists:all(fun(V) -> V == true end,
854-
rt:pmap(index_built_fun(Node), Partitions)),
855-
lager:debug("For node ~p all built = ~p", [Node, AllBuilt]),
856-
AllBuilt;
857-
false ->
858-
false
859-
end
836+
fun(Node, _AllBuilt = true) ->
837+
case get_aae_tree_info(Node) of
838+
{ok, TreeInfos} ->
839+
case all_trees_have_build_times(TreeInfos) of
840+
true ->
841+
Partitions = [I || {I, _} <- TreeInfos],
842+
all_aae_trees_built(Node, Partitions);
843+
false ->
844+
some_trees_not_built
845+
end;
846+
Err ->
847+
Err
848+
end;
849+
(_Node, Err) ->
850+
Err
851+
end.
852+
853+
% It is unlikely but possible to get a tree built time from compute_tree_info
854+
% but an attempt to use the tree returns not_built. This is because the build
855+
% process has finished, but the lock on the tree won't be released until it
856+
% dies and the manager detects it. Yes, this is super freaking paranoid.
857+
all_aae_trees_built(Node, Partitions) ->
858+
%% Notice that the process locking is spawned by the
859+
%% pmap. That's important! as it should die eventually
860+
%% so the lock is released and the test can lock the tree.
861+
IndexBuilts = rt:pmap(index_built_fun(Node), Partitions),
862+
BadOnes = [R || R <- IndexBuilts, R /= true],
863+
case BadOnes of
864+
[] ->
865+
true;
866+
_ ->
867+
BadOnes
860868
end.
861869

862870
get_aae_tree_info(Node) ->
863871
case rpc:call(Node, riak_kv_entropy_info, compute_tree_info, []) of
864872
{badrpc, _} ->
865-
{error, badrpc};
873+
{error, {badrpc, Node}};
866874
Info ->
867875
lager:debug("Entropy table on node ~p : ~p", [Node, Info]),
868876
{ok, Info}
869877
end.
870878

871-
all_trees_have_build_times({badrpc, _}) ->
872-
false;
873-
all_trees_have_build_times({ok, Info}) ->
879+
all_trees_have_build_times(Info) ->
874880
not lists:keymember(undefined, 2, Info).
875881

876882
index_built_fun(Node) ->
877883
fun(Idx) ->
878-
{ok, TreePid} = rpc:call(Node, riak_kv_vnode,
879-
hashtree_pid, [Idx]),
880-
TreeLocked =
881-
rpc:call(Node, riak_kv_index_hashtree, get_lock,
882-
[TreePid, for_riak_test]),
883-
lager:debug("Partition ~p : ~p", [Idx, TreeLocked]),
884-
TreeLocked == ok
885-
orelse TreeLocked == already_locked
884+
case rpc:call(Node, riak_kv_vnode,
885+
hashtree_pid, [Idx]) of
886+
{ok, TreePid} ->
887+
case rpc:call(Node, riak_kv_index_hashtree,
888+
get_lock, [TreePid, for_riak_test]) of
889+
{badrpc, _} ->
890+
{error, {badrpc, Node}};
891+
TreeLocked when TreeLocked == ok;
892+
TreeLocked == already_locked ->
893+
true;
894+
Err ->
895+
% Either not_built or some unhandled result,
896+
% in which case update this case please!
897+
{error, {index_not_built, Node, Idx, Err}}
898+
end;
899+
{badrpc, _} ->
900+
{error, {badrpc, Node}}
901+
end
886902
end.
887903

888904
%%%===================================================================

0 commit comments

Comments
 (0)