diff --git a/src/rt.erl b/src/rt.erl
index 96f31e2c1..09bfa768d 100644
--- a/src/rt.erl
+++ b/src/rt.erl
@@ -449,7 +449,7 @@ staged_join(Node, PNode) ->
 
 plan_and_commit(Node) ->
     timer:sleep(500),
-    lager:info("planning and commiting cluster join"),
+    lager:info("planning cluster join"),
     case rpc:call(Node, riak_core_claimant, plan, []) of
         {error, ring_not_ready} ->
             lager:info("plan: ring not ready"),
@@ -461,6 +461,7 @@ plan_and_commit(Node) ->
     end.
 
 do_commit(Node) ->
+    lager:info("committing cluster join"),
     case rpc:call(Node, riak_core_claimant, commit, []) of
         {error, plan_changed} ->
             lager:info("commit: plan changed"),
@@ -472,8 +473,9 @@ do_commit(Node) ->
             timer:sleep(100),
             maybe_wait_for_changes(Node),
             do_commit(Node);
-        {error,nothing_planned} ->
+        {error, nothing_planned} ->
             %% Assume plan actually committed somehow
+            lager:info("commit: nothing planned"),
             ok;
         ok ->
             ok
@@ -719,7 +721,13 @@ wait_until_no_pending_changes(Nodes) ->
                 rpc:multicall(Nodes, riak_core_vnode_manager, force_handoffs, []),
                 {Rings, BadNodes} = rpc:multicall(Nodes, riak_core_ring_manager, get_raw_ring, []),
                 Changes = [ riak_core_ring:pending_changes(Ring) =:= [] || {ok, Ring} <- Rings ],
-                BadNodes =:= [] andalso length(Changes) =:= length(Nodes) andalso lists:all(fun(T) -> T end, Changes)
+                case BadNodes =:= [] andalso length(Changes) =:= length(Nodes) andalso lists:all(fun(T) -> T end, Changes) of
+                    true -> true;
+                    false ->
+                        NodesWithChanges = [Node || {Node, false} <- lists:zip(Nodes -- BadNodes, Changes)],
+                        lager:info("Changes not yet complete, or bad nodes. BadNodes=~p, Nodes with Pending Changes=~p~n", [BadNodes, NodesWithChanges]),
+                        false
+                end
         end,
     ?assertEqual(ok, wait_until(F)),
     ok.
diff --git a/tests/bucket_types.erl b/tests/bucket_types.erl
index 1715cee82..7e426be5a 100644
--- a/tests/bucket_types.erl
+++ b/tests/bucket_types.erl
@@ -10,7 +10,11 @@ confirm() ->
     lager:info("Deploy some nodes"),
     Nodes = rt:build_cluster(4, [], [
             {riak_core, [{default_bucket_props,
-                    [{n_val, 2}]}]}]),
+                    [
+                        {n_val, 2},
+                        {allow_mult, true},
+                        {dvv_enabled, true}
+                    ]}]}]),
     Node = hd(Nodes),
 
     RMD = riak_test_runner:metadata(),
diff --git a/tests/ensemble_byzantine.erl b/tests/ensemble_byzantine.erl
index 0449fc574..155f1d5b0 100644
--- a/tests/ensemble_byzantine.erl
+++ b/tests/ensemble_byzantine.erl
@@ -56,7 +56,7 @@ confirm() ->
     test_lose_minority_synctrees(PBC, Bucket, Key, Val, PL),
     test_lose_majority_synctrees(PBC, Bucket, Key, Val, PL),
     test_lose_minority_synctrees_one_node_partitioned(PBC, Bucket, Key, Val,
-                                                      PL, Nodes), 
+                                                      PL, Nodes),
     test_lose_all_data_and_trees_except_one_node(PBC, Bucket, Key, Val, PL),
     {ok, _NewVal} = test_backup_restore_data_not_trees(Bucket, Key, Val, PL),
     test_lose_all_data(PBC, Bucket, Key, PL),
@@ -64,7 +64,12 @@ confirm() ->
     pass.
 
 config() ->
-    [{riak_core, [{default_bucket_props, [{n_val, 5}]},
+    [{riak_core, [{default_bucket_props,
+                   [
+                    {n_val, 5},
+                    {allow_mult, true},
+                    {dvv_enabled, true}
+                   ]},
                   {vnode_management_timer, 1000},
                   {ring_creation_size, 16},
                   {enable_consensus, true},
@@ -79,7 +84,7 @@ test_lose_majority_synctrees(PBC, Bucket, Key, Val, PL) ->
     assert_lose_synctrees_and_recover(PBC, Bucket, Key, Val, PL, Majority).
 
 test_lose_minority_synctrees_one_node_partitioned(PBC, Bucket, Key, Val, PL,
-                                                   Nodes) -> 
+                                                   Nodes) ->
     Minority = minority_vnodes(PL),
     {{Idx0, Node0}, primary} = hd(PL),
     Ensemble = {kv, Idx0, 5},
@@ -251,7 +256,7 @@ kill_peers(Ensemble, Nodes) ->
     Peers = [P || P={_Id, N} <- View, lists:member(N, Nodes)],
     lager:info("Killing Peers: ~p", [Peers]),
     Pids = [rpc:call(Node, riak_ensemble_manager, get_peer_pid,
-                     [Ensemble, Peer]) || Peer <- Peers], 
+                     [Ensemble, Peer]) || Peer <- Peers],
     [exit(Pid, kill) || Pid <- Pids, Pid =/= undefined].
 
 wipe_partitions(PL) ->
diff --git a/tests/ensemble_ring_changes.erl b/tests/ensemble_ring_changes.erl
index 6823d76ee..59e9bc365 100644
--- a/tests/ensemble_ring_changes.erl
+++ b/tests/ensemble_ring_changes.erl
@@ -27,7 +27,12 @@
 -define(RING_SIZE, 16).
 
 config() ->
-    [{riak_core, [{default_bucket_props, [{n_val, 5}]},
+    [{riak_core, [{default_bucket_props,
+                   [
+                    {n_val, 5},
+                    {allow_mult, true},
+                    {dvv_enabled, true}
+                   ]},
                   {vnode_management_timer, 1000},
                   {ring_creation_size, ?RING_SIZE},
                   {enable_consensus, true},
diff --git a/tests/ensemble_util.erl b/tests/ensemble_util.erl
index d6f79145b..b3bca16b3 100644
--- a/tests/ensemble_util.erl
+++ b/tests/ensemble_util.erl
@@ -58,10 +58,15 @@ fast_config(Nval, EnableAAE) when is_boolean(EnableAAE) ->
 
 fast_config(NVal, RingSize, EnableAAE) ->
     [config_aae(EnableAAE),
-     {riak_core, [{default_bucket_props, [{n_val, NVal}]},
-                  {vnode_management_timer, 1000},
-                  {ring_creation_size, RingSize},
-                  {enable_consensus, true}]}].
+     {riak_core, [{default_bucket_props,
+                   [
+                    {n_val, NVal},
+                    {allow_mult, true},
+                    {dvv_enabled, true}
+                   ]},
+                  {vnode_management_timer, 1000},
+                  {ring_creation_size, RingSize},
+                  {enable_consensus, true}]}].
 
 config_aae(true) ->
     {riak_kv, [{anti_entropy_build_limit, {100, 1000}},
diff --git a/tests/http_bucket_types.erl b/tests/http_bucket_types.erl
index d5994b159..2ffe118c0 100644
--- a/tests/http_bucket_types.erl
+++ b/tests/http_bucket_types.erl
@@ -13,7 +13,11 @@ confirm() ->
     lager:info("Deploy some nodes"),
     Nodes = rt:build_cluster(4, [], [
             {riak_core, [{default_bucket_props,
-                    [{n_val, 2}]}]}]),
+                    [
+                        {n_val, 2},
+                        {allow_mult, true},
+                        {dvv_enabled, true}
+                    ]}]}]),
     Node = hd(Nodes),
 
     RMD = riak_test_runner:metadata(),
diff --git a/tests/http_security.erl b/tests/http_security.erl
index 8bf09ddb6..fac70e329 100644
--- a/tests/http_security.erl
+++ b/tests/http_security.erl
@@ -30,7 +30,7 @@ confirm() ->
     PrivDir = rt:priv_dir(),
     Conf = [
             {riak_core, [
-                         {default_bucket_props, [{allow_mult, true}]},
+                         {default_bucket_props, [{allow_mult, true}, {dvv_enabled, true}]},
                          {ssl, [
                                 {certfile, filename:join([CertDir,
                                                           "site3.basho.com/cert.pem"])},
diff --git a/tests/overload.erl b/tests/overload.erl
index 54b17adcd..819f029a7 100644
--- a/tests/overload.erl
+++ b/tests/overload.erl
@@ -59,7 +59,12 @@ default_config(#config{
                   fsm_limit = FsmLimit
                  }) ->
     [{riak_core, [{ring_creation_size, 8},
-                  {default_bucket_props, [{n_val, 5}]},
+                  {default_bucket_props,
+                   [
+                    {n_val, 5},
+                    {allow_mult, true},
+                    {dvv_enabled, true}
+                   ]},
                   {vnode_management_timer, 1000},
                   {enable_health_checks, false},
                   {enable_consensus, true},
diff --git a/tests/pb_security.erl b/tests/pb_security.erl
index 3b06d1d7a..00d1bac96 100644
--- a/tests/pb_security.erl
+++ b/tests/pb_security.erl
@@ -53,7 +53,7 @@ confirm() ->
     PrivDir = rt:priv_dir(),
     Conf = [
             {riak_core, [
-                         {default_bucket_props, [{allow_mult, true}]},
+                         {default_bucket_props, [{allow_mult, true}, {dvv_enabled, true}]},
                          {ssl, [
                                 {certfile, filename:join([CertDir,"site3.basho.com/cert.pem"])},
                                 {keyfile, filename:join([CertDir, "site3.basho.com/key.pem"])},
diff --git a/tests/proxy_overload_recovery.erl b/tests/proxy_overload_recovery.erl
index 111320422..400d21ea0 100644
--- a/tests/proxy_overload_recovery.erl
+++ b/tests/proxy_overload_recovery.erl
@@ -17,14 +17,14 @@
 %% cleared and the model is reset. The main goal is to advance the proxy
 %% into interesting new states.
 %%
-%% This test can be run outside of riak_test while working on it. 
+%% This test can be run outside of riak_test while working on it.
 %% Symlink the source into a release build and run
-%% c(proxy_overload_recovery). 
+%% c(proxy_overload_recovery).
 %% proxy_overload_recovery:run(300). % Run for 5 mins
 %%
 %% On failure you can re-run counter examples *and* print out the internal
 %% state with the run.
-%% proxy_overload_recovery:check(). 
+%% proxy_overload_recovery:check().
 %%
 %% TODO/Questions:
 %% 1) Is there a better way to do the initialization step?
@@ -137,7 +137,7 @@ prop_proxy_recovery() ->
                                        [catch msgq_len(VPid)])
                          end
                  end,
-                 measure(duration, Msecs, 
+                 measure(duration, Msecs,
                          aggregate(with_title("Commands"), command_names(Cmds),
                                    pretty_commands(?MODULE, Cmds, {H, S, Res},
                                                    Res == ok))))
@@ -173,13 +173,13 @@ precondition_common(#tstate{rt = undefined}, {call, _M, F, _A}) ->
 precondition_common(_, {call, _M, F, _A}) ->
     F /= prepare.
 
-%% %% Make sure we're still running what we think we're running - uncomment 
+%% %% Make sure we're still running what we think we're running - uncomment
 %% %% if having process death issues
 %% invariant(#tstate{rt = undefined}) ->
 %%     true;
 %% invariant(#tstate{rt = #rt{id = Index, ppid = PPid, vpid = VPid}}) ->
 %%     RegName = riak_core_vnode_proxy:reg_name(riak_kv_vnode, Index),
-%%     PPid = whereis(RegName), % Check process we think it is. 
+%%     PPid = whereis(RegName), % Check process we think it is.
 %%     true = is_process_alive(PPid),
 %%     true = is_process_alive(VPid),
 %%     true.
@@ -208,13 +208,14 @@ prepare(ThresholdSeed) ->
     {ok, VPid0} = riak_core_vnode_manager:get_vnode_pid(Id, riak_kv_vnode),
     sys:resume(VPid0),
     ok = supervisor:terminate_child(riak_core_vnode_sup, VPid0),
-    false = is_process_alive(VPid0),
 
     %% Reset the proxy pid to make sure it resets state and picks up the new
     %% environment variables
     ok = supervisor:terminate_child(riak_core_vnode_proxy_sup, {riak_kv_vnode, Id}),
     RegName = riak_core_vnode_proxy:reg_name(riak_kv_vnode, Index),
     undefined = whereis(RegName),
+    VPid1 = wait_for_vnode_change(VPid0, Index),
+
 
     {ok, PPid} = supervisor:restart_child(riak_core_vnode_proxy_sup,
                                           {riak_kv_vnode, Id}),
@@ -225,6 +226,7 @@ prepare(ThresholdSeed) ->
     %% and return the Pid so we know we have the same Pid.
     {ok, VPid} = riak_core_vnode_proxy:command_return_vnode(
                    {riak_kv_vnode,Index,node()}, timeout),
+    ?assertEqual(VPid, VPid1),
     true = is_process_alive(PPid),
     true = is_process_alive(VPid),
 
@@ -264,14 +266,14 @@ resume_args(#tstate{rt = RT}) ->
 resume(#rt{ppid = PPid, vpid = VPid}) ->
     sys:resume(VPid),
     %% Use the sys:get_status call to force a synchronous call
-    %% against the vnode proxy to ensure all messages sent by
+    %% against the vnode & the proxy to ensure all messages sent by
     %% this process have been serviced and there are no pending
     %% 'ping's in the vnode before we continue.
     %% Then drain the vnode to make sure any pending pongs have
-    %% been sent. 
-    ok = drain(VPid),
+    %% been sent, and ensure the proxy has processed them too.
+    _ = sys:get_status(PPid),
     _ = sys:get_status(VPid),
-    _ = sys:get_status(PPid).
+    ok = drain([VPid, PPid]).
 
 resume_next(S, _V, _A) ->
     S#tstate{vnode_running = true, proxy_msgs = 0, direct_msgs = 0}.
@@ -324,28 +326,28 @@ overloaded_args(#tstate{vnode_running = Running, rt = RT}) ->
 
 overloaded(Running, #rt{ppid = PPid, vpid = VPid}) ->
     case Running of
         true ->
-            ok = drain(PPid), % make sure all proxy msgs processed/dropped
-            ok = drain(VPid); % make sure any pending ping/pongs are processed
+            ok = drain([PPid, VPid]);
         _ ->
             ok
     end,
-    {riak_core_vnode_proxy:overloaded(PPid),
-     msgq_len(VPid), % just for debug so we can review in log output
-     sys:get_status(PPid)}. % ditto
+    {messages, PMsgs} = process_info(PPid, messages),
+    {messages, VMsgs} = process_info(VPid, messages),
+    Overloaded = riak_core_vnode_proxy:overloaded(PPid),
+    {Overloaded, {VMsgs, PMsgs}, sys:get_status(PPid)}.
 
 overloaded_post(#tstate{threshold = undefined}, _A,
-                {R, _VnodeQ, _ProxyStatus}) ->
+                {R, _Messages, _ProxyStatus}) ->
     %% If there are no thresholds there should never be an overload
     eq(R, false);
 overloaded_post(#tstate{vnode_running = true}, _A,
-                {R, _VnodeQ = 0, _ProxyStatus}) ->
+                {R, _Messages, _ProxyStatus}) ->
     %% If the vnode is running, we have cleared queues so
     %% should not be in overload.
     eq(R, false);
 overloaded_post(#tstate{vnode_running = false, proxy_msgs = ProxyMsgs,
                         threshold = Threshold}, _A,
-                {ResultOverload, _VnodeQ, _ProxyStatus}) ->
+                {ResultOverload, _Messages, _ProxyStatus}) ->
     %% Either
     %%   mailbox is completely an estimate based on proxy msgs
     %%   or mailbox is a check + estimate since
@@ -392,16 +394,33 @@ prep_env(Var, Val) ->
 %% Wait until all messages are drained by the Pid. No guarantees
 %% about future messages being sent, or that responses for the
 %% last message consumed have been transmitted.
-%%
-drain(Pid) ->
-    case erlang:process_info(Pid, message_queue_len) of
-        {message_queue_len, 0} ->
+%% NOTE: The "drain 3 times in a row" was determined empirically,
+%% and may not be sufficient (2 was not). Given time constraints,
+%% living with it for now. If this fails, we should really add some
+%% tracing code around the send of messages to Vnode and Proxy to
+%% determine where extra messages are coming from rather than just
+%% make this "try 4 times"
+%%
+drain(Pid) when is_pid(Pid) ->
+    drain([Pid], {-1, -1});
+
+drain(Pids) when is_list(Pids) ->
+    drain(Pids, {-1, -1}).
+drain(Pids, {PrevPrev, Prev}) ->
+    _ = [sys:suspend(Pid) || Pid <- Pids],
+    Len = lists:foldl(fun(Pid, Acc0) ->
+                              {message_queue_len, Len} = erlang:process_info(Pid, message_queue_len),
+                              Acc0 + Len
+                      end, 0, Pids),
+    _ = [sys:resume(Pid) || Pid <- Pids],
+    case {PrevPrev, Prev, Len} of
+        {0, 0, 0} ->
             ok;
-        {message_queue_len, L} when L > 0 ->
-            timer:sleep(1), % give it a millisecond to drain
-            drain(Pid);
-        ER ->
-            ER
+        _ ->
+            %% Attempt to ensure something else is scheduled before we try to drain again
+            erlang:yield(),
+            timer:sleep(1),
+            drain(Pids, {Prev, Len})
     end.
 
 %% Return the length of the message queue (or crash if proc dead)
@@ -448,6 +467,17 @@ add_eqc_apps(Nodes) ->
          end || App <- Apps, Node <- Nodes],
     ok.
 
+
+wait_for_vnode_change(VPid0, Index) ->
+    {ok, VPid1} = riak_core_vnode_manager:get_vnode_pid(Index, riak_kv_vnode),
+    case VPid1 of
+        VPid0 ->
+            timer:sleep(1),
+            wait_for_vnode_change(VPid0, Index);
+        _ ->
+            VPid1
+    end.
+
 -else. %% no EQC
 
 -export([confirm/0]).
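For reference, the new drain/2 above replaces a single "queue is empty" check with a stability loop: it only returns once the combined message-queue length of the vnode and its proxy has read zero on three consecutive polls. The sketch below isolates that idea; it is a minimal illustration and not part of the patch (the module and function names are hypothetical, and it omits the sys:suspend/resume bracketing that the patch uses while sampling the queues):

-module(drain_sketch).
-export([drain_until_stable/1]).

%% Poll the message queues of Pids until their combined length has been
%% observed as zero three polls in a row ({PrevPrev, Prev, Len} all zero).
%% A single zero reading is not enough: a message may still be in flight
%% between the proxy and the vnode at the moment we look.
drain_until_stable(Pids) ->
    drain_until_stable(Pids, {-1, -1}).

drain_until_stable(Pids, {PrevPrev, Prev}) ->
    Len = lists:sum([begin
                         {message_queue_len, N} =
                             erlang:process_info(Pid, message_queue_len),
                         N
                     end || Pid <- Pids]),
    case {PrevPrev, Prev, Len} of
        {0, 0, 0} ->
            ok;
        _ ->
            %% Yield and sleep briefly so anything in flight can land
            %% before the next poll.
            erlang:yield(),
            timer:sleep(1),
            drain_until_stable(Pids, {Prev, Len})
    end.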
diff --git a/tests/repl_aae_fullsync.erl b/tests/repl_aae_fullsync.erl
index a04e86012..41397831b 100644
--- a/tests/repl_aae_fullsync.erl
+++ b/tests/repl_aae_fullsync.erl
@@ -16,7 +16,12 @@
         {riak_core,
             [
              {ring_creation_size, 8},
-             {default_bucket_props, [{n_val, 1}]}
+             {default_bucket_props,
+              [
+               {n_val, 1},
+               {allow_mult, true},
+               {dvv_enabled, true}
+              ]}
             ]
         },
         {riak_kv,
@@ -46,6 +51,8 @@ confirm() ->
     pass.
 
 simple_test() ->
+    lager:info("Starting simple_test"),
+
     %% Deploy 6 nodes.
     Nodes = rt:deploy_nodes(6, ?CONF(5), [riak_kv, riak_repl]),
 
@@ -118,6 +125,8 @@ simple_test() ->
     pass.
 
 dual_test() ->
+    lager:info("Starting dual_test"),
+
     %% Deploy 6 nodes.
     Nodes = rt:deploy_nodes(6, ?CONF(infinity), [riak_kv, riak_repl]),
 
@@ -218,6 +227,8 @@ dual_test() ->
     pass.
 
 bidirectional_test() ->
+    lager:info("Starting bidirectional_test"),
+
     %% Deploy 6 nodes.
     Nodes = rt:deploy_nodes(6, ?CONF(5), [riak_kv, riak_repl]),
 
@@ -301,6 +312,8 @@ bidirectional_test() ->
     pass.
 
 difference_test() ->
+    lager:info("Starting difference_test"),
+
     %% Deploy 6 nodes.
     Nodes = rt:deploy_nodes(6, ?CONF(5), [riak_kv, riak_repl]),
 
@@ -393,6 +406,8 @@ difference_test() ->
     pass.
 
 deadlock_test() ->
+    lager:info("Starting deadlock_test"),
+
     %% Deploy 6 nodes.
     Nodes = rt:deploy_nodes(6, ?CONF(5), [riak_kv, riak_repl]),
 
diff --git a/tests/repl_aae_fullsync_custom_n.erl b/tests/repl_aae_fullsync_custom_n.erl
index a8294bcd2..715510b03 100644
--- a/tests/repl_aae_fullsync_custom_n.erl
+++ b/tests/repl_aae_fullsync_custom_n.erl
@@ -22,7 +22,12 @@ confirm() ->
         {riak_core,
             [
              {ring_creation_size, 8},
-             {default_bucket_props, [{n_val, 1}]}
+             {default_bucket_props,
+              [
+               {n_val, 1},
+               {allow_mult, true},
+               {dvv_enabled, true}
+              ]}
             ]
         },
         {riak_kv,
diff --git a/tests/repl_cancel_fullsync.erl b/tests/repl_cancel_fullsync.erl
index 03c69b9ef..bcbea39ef 100644
--- a/tests/repl_cancel_fullsync.erl
+++ b/tests/repl_cancel_fullsync.erl
@@ -11,7 +11,12 @@
         {riak_core,
             [
              {ring_creation_size, 8},
-             {default_bucket_props, [{n_val, 1}]}
+             {default_bucket_props,
+              [
+               {n_val, 1},
+               {allow_mult, true},
+               {dvv_enabled, true}
+              ]}
             ]
         },
         {riak_kv,
diff --git a/tests/repl_location_failures.erl b/tests/repl_location_failures.erl
index cb74985f5..6151a4546 100644
--- a/tests/repl_location_failures.erl
+++ b/tests/repl_location_failures.erl
@@ -13,7 +13,12 @@
         {riak_core,
             [
              {ring_creation_size, 8},
-             {default_bucket_props, [{n_val, 1}]}
+             {default_bucket_props,
+              [
+               {n_val, 1},
+               {allow_mult, true},
+               {dvv_enabled, true}
+              ]}
             ]
         },
         {riak_kv,
diff --git a/tests/replication_object_reformat.erl b/tests/replication_object_reformat.erl
index cb1cf6851..4112d9a57 100644
--- a/tests/replication_object_reformat.erl
+++ b/tests/replication_object_reformat.erl
@@ -11,7 +11,12 @@
         {riak_core,
             [
              {ring_creation_size, 8},
-             {default_bucket_props, [{n_val, ?N}]}
+             {default_bucket_props,
+              [
+               {n_val, ?N},
+               {allow_mult, true},
+               {dvv_enabled, true}
+              ]}
             ]
         },
         {riak_kv,
diff --git a/tests/verify_counter_repl.erl b/tests/verify_counter_repl.erl
index 79a107b37..7e1a282ca 100644
--- a/tests/verify_counter_repl.erl
+++ b/tests/verify_counter_repl.erl
@@ -62,7 +62,7 @@ confirm() ->
 make_clusters() ->
     Conf = [{riak_repl, [{fullsync_on_connect, false},
                          {fullsync_interval, disabled}]},
-            {riak_core, [{default_bucket_props, [{allow_mult, true}]}]}],
+            {riak_core, [{default_bucket_props, [{allow_mult, true}, {dvv_enabled, true}]}]}],
     Nodes = rt:deploy_nodes(6, Conf, [riak_kv, riak_repl]),
     {ClusterA, ClusterB} = lists:split(3, Nodes),
     A = make_cluster(ClusterA, "A"),
diff --git a/tests/verify_handoff_write_once.erl b/tests/verify_handoff_write_once.erl
index 9753946ed..1e311577b 100644
--- a/tests/verify_handoff_write_once.erl
+++ b/tests/verify_handoff_write_once.erl
@@ -60,7 +60,12 @@ confirm() ->
 
 create_config(Backend) ->
     [{riak_core, [
-        {default_bucket_props, [{n_val, 1}]},
+        {default_bucket_props,
+            [
+                {n_val, 1},
+                {allow_mult, true},
+                {dvv_enabled, true}
+            ]},
         {ring_creation_size, 8},
         {handoff_acksync_threshold, 20},
         {handoff_concurrency, 4},
diff --git a/tests/verify_write_once.erl b/tests/verify_write_once.erl
index 02fcf41d4..71b7bcccf 100644
--- a/tests/verify_write_once.erl
+++ b/tests/verify_write_once.erl
@@ -304,7 +304,12 @@ config(RingSize, NVal) ->
 config(RingSize, NVal, Backend) ->
     [
         {riak_core, [
-            {default_bucket_props, [{n_val, NVal}]},
+            {default_bucket_props,
+                [
+                    {n_val, NVal},
+                    {allow_mult, true},
+                    {dvv_enabled, true}
+                ]},
             {vnode_management_timer, 1000},
             {ring_creation_size, RingSize}]
     },
diff --git a/tests/yz_default_bucket_type_upgrade.erl b/tests/yz_default_bucket_type_upgrade.erl
index f3d94e849..b61b4353c 100644
--- a/tests/yz_default_bucket_type_upgrade.erl
+++ b/tests/yz_default_bucket_type_upgrade.erl
@@ -38,7 +38,12 @@
     [{riak_core,
       [
        {ring_creation_size, 16},
-       {default_bucket_props, [{n_val, ?N}]},
+       {default_bucket_props,
+        [
+         {n_val, ?N},
+         {allow_mult, true},
+         {dvv_enabled, true}
+        ]},
        {anti_entropy_build_limit, {100, 1000}},
        {anti_entropy_concurrency, 8}
       ]},
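Every test config touched above converges on the same default_bucket_props shape. For reference only, the sketch below shows that shape factored into a helper; the module and function names are hypothetical, while the property keys are exactly the ones the patch adds. allow_mult keeps concurrent writes as siblings rather than silently overwriting them, and dvv_enabled makes Riak track those siblings with dotted version vectors so they are pruned correctly.

-module(dvv_config_sketch).
-export([riak_core_section/2]).

%% Build the riak_core section of a riak_test node config with explicit
%% sibling semantics, mirroring the defaults this changeset standardises on.
riak_core_section(NVal, RingSize) ->
    {riak_core, [{default_bucket_props,
                  [{n_val, NVal},
                   {allow_mult, true},
                   {dvv_enabled, true}]},
                 {ring_creation_size, RingSize}]}.

%% Example usage in a test's confirm/0, mirroring bucket_types above:
%%     Nodes = rt:build_cluster(4, [], [riak_core_section(2, 16)]).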