Skip to content

Commit bda1b5c

Browse files
committed
Merge pull request #543 from basho/refactor/one-wait-4-aae-trees
Merge repl and rt versions of wait until AAE trees build
2 parents 29def51 + 31588c5 commit bda1b5c

File tree

5 files changed

+73
-72
lines changed

5 files changed

+73
-72
lines changed

src/rt.erl

Lines changed: 55 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -828,61 +828,77 @@ wait_until_nodes_agree_about_ownership(Nodes) ->
828828
%% AAE support
829829
wait_until_aae_trees_built(Nodes) ->
830830
lager:info("Wait until AAE builds all partition trees across ~p", [Nodes]),
831-
%% Wait until all nodes report no undefined trees
832-
wait_until(fun() ->
833-
lists:foldl(aae_tree_built_fun(), true, Nodes)
834-
end).
831+
BuiltFun = fun() -> lists:foldl(aae_tree_built_fun(), true, Nodes) end,
832+
?assertEqual(ok, wait_until(BuiltFun)),
833+
ok.
835834

836835
aae_tree_built_fun() ->
837-
fun(_, false) ->
838-
false;
839-
(Node, _AllBuilt = true) ->
840-
InfoRes = get_aae_tree_info(Node),
841-
case all_trees_have_build_times(InfoRes) of
842-
true ->
843-
{ok, Info} = InfoRes,
844-
Partitions = [I || {I, _} <- Info],
845-
lager:debug("Check if really built by locking"),
846-
847-
%% Try to lock each partition. If you get not_built,
848-
%% the manager has not detected the built process has
849-
%% died yet.
850-
%% Notice that the process locking is spawned by the
851-
%% pmap. That's important! as it should die eventually
852-
%% so the test can lock on the tree.
853-
AllBuilt = lists:all(fun(V) -> V == true end,
854-
rt:pmap(index_built_fun(Node), Partitions)),
855-
lager:debug("For node ~p all built = ~p", [Node, AllBuilt]),
856-
AllBuilt;
857-
false ->
858-
false
859-
end
836+
fun(Node, _AllBuilt = true) ->
837+
case get_aae_tree_info(Node) of
838+
{ok, TreeInfos} ->
839+
case all_trees_have_build_times(TreeInfos) of
840+
true ->
841+
Partitions = [I || {I, _} <- TreeInfos],
842+
all_aae_trees_built(Node, Partitions);
843+
false ->
844+
some_trees_not_built
845+
end;
846+
Err ->
847+
Err
848+
end;
849+
(_Node, Err) ->
850+
Err
851+
end.
852+
853+
% It is unlikely but possible to get a tree built time from compute_tree_info
854+
% but an attempt to use the tree returns not_built. This is because the build
855+
% process has finished, but the lock on the tree won't be released until it
856+
% dies and the manager detects it. Yes, this is super freaking paranoid.
857+
all_aae_trees_built(Node, Partitions) ->
858+
%% Notice that the process locking is spawned by the
859+
%% pmap. That's important! as it should die eventually
860+
%% so the lock is released and the test can lock the tree.
861+
IndexBuilts = rt:pmap(index_built_fun(Node), Partitions),
862+
BadOnes = [R || R <- IndexBuilts, R /= true],
863+
case BadOnes of
864+
[] ->
865+
true;
866+
_ ->
867+
BadOnes
860868
end.
861869

862870
get_aae_tree_info(Node) ->
863871
case rpc:call(Node, riak_kv_entropy_info, compute_tree_info, []) of
864872
{badrpc, _} ->
865-
{error, badrpc};
873+
{error, {badrpc, Node}};
866874
Info ->
867875
lager:debug("Entropy table on node ~p : ~p", [Node, Info]),
868876
{ok, Info}
869877
end.
870878

871-
all_trees_have_build_times({badrpc, _}) ->
872-
false;
873-
all_trees_have_build_times({ok, Info}) ->
879+
all_trees_have_build_times(Info) ->
874880
not lists:keymember(undefined, 2, Info).
875881

876882
index_built_fun(Node) ->
877883
fun(Idx) ->
878-
{ok, TreePid} = rpc:call(Node, riak_kv_vnode,
879-
hashtree_pid, [Idx]),
880-
TreeLocked =
881-
rpc:call(Node, riak_kv_index_hashtree, get_lock,
882-
[TreePid, for_riak_test]),
883-
lager:debug("Partition ~p : ~p", [Idx, TreeLocked]),
884-
TreeLocked == ok
885-
orelse TreeLocked == already_locked
884+
case rpc:call(Node, riak_kv_vnode,
885+
hashtree_pid, [Idx]) of
886+
{ok, TreePid} ->
887+
case rpc:call(Node, riak_kv_index_hashtree,
888+
get_lock, [TreePid, for_riak_test]) of
889+
{badrpc, _} ->
890+
{error, {badrpc, Node}};
891+
TreeLocked when TreeLocked == ok;
892+
TreeLocked == already_locked ->
893+
true;
894+
Err ->
895+
% Either not_built or some unhandled result,
896+
% in which case update this case please!
897+
{error, {index_not_built, Node, Idx, Err}}
898+
end;
899+
{badrpc, _} ->
900+
{error, {badrpc, Node}}
901+
end
886902
end.
887903

888904
%%%===================================================================

tests/repl_aae_fullsync.erl

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -91,8 +91,8 @@ simple_test() ->
9191
read_from_cluster(BFirst, 1, ?NUM_KEYS, ?NUM_KEYS),
9292

9393
%% Wait for trees to compute.
94-
repl_util:wait_until_aae_trees_built(ANodes),
95-
repl_util:wait_until_aae_trees_built(BNodes),
94+
rt:wait_until_aae_trees_built(ANodes),
95+
rt:wait_until_aae_trees_built(BNodes),
9696

9797
lager:info("Test fullsync from cluster A leader ~p to cluster B",
9898
[LeaderA]),
@@ -186,9 +186,9 @@ dual_test() ->
186186
rt:wait_until_ring_converged(ANodes),
187187

188188
%% Wait for trees to compute.
189-
repl_util:wait_until_aae_trees_built(ANodes),
190-
repl_util:wait_until_aae_trees_built(BNodes),
191-
repl_util:wait_until_aae_trees_built(CNodes),
189+
rt:wait_until_aae_trees_built(ANodes),
190+
rt:wait_until_aae_trees_built(BNodes),
191+
rt:wait_until_aae_trees_built(CNodes),
192192

193193
%% Flush AAE trees to disk.
194194
perform_sacrifice(AFirst),
@@ -278,7 +278,7 @@ bidirectional_test() ->
278278
perform_sacrifice(AFirst),
279279

280280
%% Wait for trees to compute.
281-
repl_util:wait_until_aae_trees_built(ANodes),
281+
rt:wait_until_aae_trees_built(ANodes),
282282

283283
%% Verify A replicated to B.
284284
validate_completed_fullsync(LeaderA, BFirst, "B", 1, ?NUM_KEYS),
@@ -291,7 +291,7 @@ bidirectional_test() ->
291291
perform_sacrifice(BFirst),
292292

293293
%% Wait for trees to compute.
294-
repl_util:wait_until_aae_trees_built(BNodes),
294+
rt:wait_until_aae_trees_built(BNodes),
295295

296296
%% Verify B replicated to A.
297297
validate_completed_fullsync(LeaderB, AFirst, "A", ?NUM_KEYS + 1, ?NUM_KEYS + ?NUM_KEYS),
@@ -350,8 +350,8 @@ difference_test() ->
350350
[{timeout, 4000}]),
351351

352352
%% Wait for trees to compute.
353-
repl_util:wait_until_aae_trees_built(ANodes),
354-
repl_util:wait_until_aae_trees_built(BNodes),
353+
rt:wait_until_aae_trees_built(ANodes),
354+
rt:wait_until_aae_trees_built(BNodes),
355355

356356
lager:info("Test fullsync from cluster A leader ~p to cluster B",
357357
[LeaderA]),
@@ -436,8 +436,8 @@ deadlock_test() ->
436436
[ok = rt_intercept:add(Target, Intercept) || Target <- ANodes],
437437

438438
%% Wait for trees to compute.
439-
repl_util:wait_until_aae_trees_built(ANodes),
440-
repl_util:wait_until_aae_trees_built(BNodes),
439+
rt:wait_until_aae_trees_built(ANodes),
440+
rt:wait_until_aae_trees_built(BNodes),
441441

442442
lager:info("Test fullsync from cluster A leader ~p to cluster B",
443443
[LeaderA]),
@@ -579,7 +579,7 @@ validate_intercepted_fullsync(InterceptTarget,
579579
rt:wait_for_service(InterceptTarget, riak_repl),
580580

581581
%% Wait until AAE trees are compueted on the rebooted node.
582-
repl_util:wait_until_aae_trees_built([InterceptTarget]).
582+
rt:wait_until_aae_trees_built([InterceptTarget]).
583583

584584
%% @doc Given a node, find the port that the cluster manager is
585585
%% listening on.

tests/repl_aae_fullsync_util.erl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -77,8 +77,8 @@ prepare_cluster_data(TestBucket, NumKeysAOnly, _NumKeysBoth, [AFirst|_] = ANodes
7777
?assertEqual(NumKeysAOnly, length(Res2)),
7878

7979
%% wait for the AAE trees to be built so that we don't get a not_built error
80-
repl_util:wait_until_aae_trees_built(ANodes),
81-
repl_util:wait_until_aae_trees_built(BNodes),
80+
rt:wait_until_aae_trees_built(ANodes),
81+
rt:wait_until_aae_trees_built(BNodes),
8282

8383
ok.
8484

tests/repl_fs_bench.erl

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -122,27 +122,27 @@ fullsync_test(Strategy, Latency) ->
122122
?assertEqual(ok, repl_util:wait_for_connection(LeaderA, "B")),
123123

124124
%% Perform fullsync of an empty cluster.
125-
repl_util:wait_until_aae_trees_built(ANodes ++ BNodes),
125+
rt:wait_until_aae_trees_built(ANodes ++ BNodes),
126126
{EmptyTime, _} = timer:tc(repl_util,
127127
start_and_wait_until_fullsync_complete,
128128
[LeaderA]),
129129

130130
%% Write keys and perform fullsync.
131131
repl_util:write_to_cluster(AFirst, 0, ?FULL_NUM_KEYS, ?TEST_BUCKET),
132-
repl_util:wait_until_aae_trees_built(ANodes ++ BNodes),
132+
rt:wait_until_aae_trees_built(ANodes ++ BNodes),
133133
{FullTime, _} = timer:tc(repl_util,
134134
start_and_wait_until_fullsync_complete,
135135
[LeaderA]),
136136

137137
%% Rewrite first 10% keys and perform fullsync.
138138
repl_util:write_to_cluster(AFirst, 0, ?DIFF_NUM_KEYS, ?TEST_BUCKET),
139-
repl_util:wait_until_aae_trees_built(ANodes ++ BNodes),
139+
rt:wait_until_aae_trees_built(ANodes ++ BNodes),
140140
{DiffTime, _} = timer:tc(repl_util,
141141
start_and_wait_until_fullsync_complete,
142142
[LeaderA]),
143143

144144
%% Write no keys, and perform the fullsync.
145-
repl_util:wait_until_aae_trees_built(ANodes ++ BNodes),
145+
rt:wait_until_aae_trees_built(ANodes ++ BNodes),
146146
{NoneTime, _} = timer:tc(repl_util,
147147
start_and_wait_until_fullsync_complete,
148148
[LeaderA]),

tests/repl_util.erl

Lines changed: 0 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,6 @@
1212
wait_until_leader_converge/1,
1313
wait_until_connection/1,
1414
wait_until_no_connection/1,
15-
wait_until_aae_trees_built/1,
1615
wait_for_reads/5,
1716
start_and_wait_until_fullsync_complete/1,
1817
start_and_wait_until_fullsync_complete/2,
@@ -329,20 +328,6 @@ nodes_with_version(Nodes, Version) ->
329328
nodes_all_have_version(Nodes, Version) ->
330329
Nodes == nodes_with_version(Nodes, Version).
331330

332-
%% AAE support
333-
wait_until_aae_trees_built(Cluster) ->
334-
lager:info("Check if all trees built for nodes ~p", [Cluster]),
335-
F = fun(Node) ->
336-
Info = rpc:call(Node,
337-
riak_kv_entropy_info,
338-
compute_tree_info,
339-
[]),
340-
NotBuilt = [X || {_,undefined}=X <- Info],
341-
NotBuilt == []
342-
end,
343-
[rt:wait_until(Node, F) || Node <- Cluster],
344-
ok.
345-
346331
%% Return the number of partitions in the cluster where Node is a member.
347332
num_partitions(Node) ->
348333
{ok, Ring} = rpc:call(Node, riak_core_ring_manager, get_raw_ring, []),

0 commit comments

Comments
 (0)