Commit 818645e

Author: Doug Rohrer
Make tests pass regardless of run order by always writing default values for configuration settings, rather than changing only certain values.
- Always set values for keys that have no defaults in the schema, since setting a value to `undefined` does not return the default value when application:get_env/3 is called.
- Provide an easier way to get the configuration by creating a config record with appropriate defaults.
- Clean up BKV1/2/3 to make them easier to read and understand, and to support future refactoring of the tests and removal of some of the complexity.
- Now that the test runs more consistently, tighten test_vnode_protection so it actually tests something.
- Bump vnode protection to ?THRESHOLD+1, as 2.0 seems to produce one extra message.
- Convert lager:info to lager:debug in successful get cases to reduce noise.
- Remove use of ConsistentType (use macros and pass BKV down to validate).

Parent: b863efc
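
The first point hinges on how OTP's application:get_env/3 treats an explicitly stored `undefined`: a stored value is returned as-is and the supplied default is ignored, so only a key that was never set falls back to the default. A minimal sketch of that behavior (the riak_core application and vnode_overload_threshold key are used purely for illustration):

    %% Explicitly storing 'undefined' means get_env/3 returns 'undefined',
    %% not the supplied default:
    ok = application:set_env(riak_core, vnode_overload_threshold, undefined),
    undefined = application:get_env(riak_core, vnode_overload_threshold, 10000),

    %% Only a key that was never set (or has been unset) falls back to the default:
    ok = application:unset_env(riak_core, vnode_overload_threshold),
    10000 = application:get_env(riak_core, vnode_overload_threshold, 10000).

This is why each test now writes the full output of default_config/1 instead of setting individual keys back to `undefined`.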

1 file changed: +89 −71 lines

tests/overload.erl

Lines changed: 89 additions & 71 deletions
@@ -31,78 +31,104 @@
 -define(GET_RETRIES, 1000).
 -define(BUCKET, <<"test">>).
 -define(KEY, <<"hotkey">>).
+-define(NORMAL_TYPE, <<"normal_type">>).
+-define(CONSISTENT_TYPE, <<"consistent_type">>).
+-define(WRITE_ONCE_TYPE, <<"write_once_type">>).
+-define(NORMAL_BKV, {{?NORMAL_TYPE, ?BUCKET}, ?KEY, <<"test">>}).
+-define(CONSISTENT_BKV, {{?CONSISTENT_TYPE, ?BUCKET}, ?KEY, <<"test">>}).
+-define(WRITE_ONCE_BKV, {{?WRITE_ONCE_TYPE, ?BUCKET}, ?KEY, <<"test">>}).
+
+%% This record contains the default values for config settings if they were not set
+%% in the advanced.config file - because setting something to `undefined` is not the same
+%% as not setting it at all, we need to make sure to overwrite with defaults for each test,
+%% not just set things back to `undefined`. Also, makes the tests re-orderable as they always
+%% set everything they need, and don't depend on a previous test to make changes.
+-record(config, {
+    vnode_overload_threshold = 10000,
+    vnode_check_interval = 5000,
+    vnode_check_request_interval = 2500,
+    fsm_limit=undefined}).
+
+default_config() ->
+    default_config(#config{}).
+
+default_config(#config{
+        vnode_overload_threshold=VnodeOverloadThreshold,
+        vnode_check_interval = VnodeCheckInterval,
+        vnode_check_request_interval = VnodeCheckRequestInterval,
+        fsm_limit = FsmLimit
+    }) ->
+    [{riak_core, [{ring_creation_size, 8},
+                  {default_bucket_props, [{n_val, 5}]},
+                  {vnode_management_timer, 1000},
+                  {enable_health_checks, false},
+                  {enable_consensus, true},
+                  {vnode_overload_threshold, VnodeOverloadThreshold},
+                  {vnode_check_interval, VnodeCheckInterval},
+                  {vnode_check_request_interval, VnodeCheckRequestInterval}]},
+     {riak_kv, [{fsm_limit, FsmLimit},
+                {storage_backend, riak_kv_eleveldb_backend},
+                {anti_entropy_build_limit, {100, 1000}},
+                {anti_entropy_concurrency, 100},
+                {anti_entropy_tick, 100},
+                {anti_entropy, {on, []}},
+                {anti_entropy_timeout, 5000}]},
+     {riak_api, [{pb_backlog, 1024}]}].

 confirm() ->
     Nodes = setup(),

-    NormalType = <<"normal_type">>,
-    ConsistentType = <<"consistent_type">>,
-    WriteOnceType = <<"write_once_type">>,
-
-    ok = create_bucket_type(Nodes, NormalType, [{n_val, 3}]),
-    ok = create_bucket_type(Nodes, ConsistentType, [{consistent, true}, {n_val, 5}]),
-    ok = create_bucket_type(Nodes, WriteOnceType, [{write_once, true}, {n_val, 1}]),
+    ok = create_bucket_type(Nodes, ?NORMAL_TYPE, [{n_val, 3}]),
+    ok = create_bucket_type(Nodes, ?CONSISTENT_TYPE, [{consistent, true}, {n_val, 5}]),
+    ok = create_bucket_type(Nodes, ?WRITE_ONCE_TYPE, [{write_once, true}, {n_val, 1}]),
     rt:wait_until(ring_manager_check_fun(hd(Nodes))),

-    BKV1 = {{NormalType, ?BUCKET}, ?KEY, <<"test">>},
-    BKV2 = {{ConsistentType, ?BUCKET}, ?KEY, <<"test">>},
-    BKV3 = {{WriteOnceType, ?BUCKET}, ?KEY, <<"test">>},
+
     Node1 = hd(Nodes),
-    write_once(Node1, BKV1),
-    write_once(Node1, BKV2),
-    write_once(Node1, BKV3),
+    write_once(Node1, ?NORMAL_BKV),
+    write_once(Node1, ?CONSISTENT_BKV),
+    write_once(Node1, ?WRITE_ONCE_BKV),

     Tests = [test_no_overload_protection,
              test_vnode_protection,
-             test_fsm_protection,
-             test_cover_queries_overload],
+             test_fsm_protection],

     [begin
          lager:info("Starting Test ~p for ~p~n", [Test, BKV]),
-         ok = erlang:apply(?MODULE, Test, [Nodes, BKV, IsConsistent])
+         ok = erlang:apply(?MODULE, Test, [Nodes, BKV])
     end || Test <- Tests,
-           {BKV, IsConsistent} <- [{BKV1, false},
-                                   {BKV2, true},
-                                   {BKV3, false}]],
+           BKV <- [?NORMAL_BKV,
+                   ?CONSISTENT_BKV,
+                   ?WRITE_ONCE_BKV]],
+    %% Test cover queries doesn't depend on bucket/keyvalue, just run it once
+    test_cover_queries_overload(Nodes),
     pass.


 setup() ->
     ensemble_util:build_cluster(5, default_config(), 5).

-default_config() ->
-    [{riak_core, [{ring_creation_size, 8},
-                  {default_bucket_props, [{n_val, 5}]},
-                  {vnode_management_timer, 1000},
-                  {enable_health_checks, false},
-                  {enable_consensus, true},
-                  {vnode_overload_threshold, undefined}]},
-     {riak_kv, [{fsm_limit, undefined},
-                {storage_backend, riak_kv_eleveldb_backend},
-                {anti_entropy_build_limit, {100, 1000}},
-                {anti_entropy_concurrency, 100},
-                {anti_entropy_tick, 100},
-                {anti_entropy, {on, []}},
-                {anti_entropy_timeout, 5000}]},
-     {riak_api, [{pb_backlog, 1024}]}].
-
-test_no_overload_protection(_Nodes, _BKV, true) ->
+test_no_overload_protection(_Nodes, ?CONSISTENT_BKV) ->
     ok;
-test_no_overload_protection(Nodes, BKV, ConsistentType) ->
+test_no_overload_protection(Nodes, BKV) ->
+    lager:info("Setting default configuration for no overload protection test."),
+    rt:pmap(fun(Node) ->
+                rt:update_app_config(Node, default_config())
+            end, Nodes),
     lager:info("Testing with no overload protection"),
     ProcFun = build_predicate_eq(test_no_overload_protection, ?NUM_REQUESTS,
                                  "ProcFun", "Procs"),
     QueueFun = build_predicate_gte(test_no_overload_protection, ?NUM_REQUESTS,
                                    "QueueFun", "Queue Size"),
-    verify_test_results(run_test(Nodes, BKV), ConsistentType, ProcFun, QueueFun).
+    verify_test_results(run_test(Nodes, BKV), BKV, ProcFun, QueueFun).

-verify_test_results({_NumProcs, QueueLen}, true, _, QueueFun) ->
+verify_test_results({_NumProcs, QueueLen}, ?CONSISTENT_BKV, _ProcFun, QueueFun) ->
     ?assert(QueueFun(QueueLen));
-verify_test_results({NumProcs, QueueLen}, false, ProcFun, QueueFun) ->
+verify_test_results({NumProcs, QueueLen}, _BKV, ProcFun, QueueFun) ->
     ?assert(ProcFun(NumProcs)),
     ?assert(QueueFun(QueueLen)).

-test_vnode_protection(Nodes, BKV, ConsistentType) ->
+test_vnode_protection(Nodes, BKV) ->
     %% Setting check_interval to one ensures that process_info is called
     %% to check the queue length on each vnode send.
     %% This allows us to artificially raise vnode queue lengths with dummy
@@ -111,14 +137,13 @@ test_vnode_protection(Nodes, BKV, ConsistentType) ->
     lager:info("Testing with vnode queue protection enabled"),
     lager:info("Setting vnode overload threshold to ~b", [?THRESHOLD]),
     lager:info("Setting vnode check interval to 1"),
-    Config = [{riak_core, [{vnode_overload_threshold, ?THRESHOLD},
-                           {vnode_check_interval, 1}]}],
+    Config = default_config(#config{vnode_overload_threshold=?THRESHOLD, vnode_check_interval=1}),
     rt:pmap(fun(Node) ->
                 rt:update_app_config(Node, Config)
             end, Nodes),
     ProcFun = build_predicate_lt(test_vnode_protection, (?NUM_REQUESTS+1), "ProcFun", "Procs"),
-    QueueFun = build_predicate_lt(test_vnode_protection, (?NUM_REQUESTS), "QueueFun", "QueueSize"),
-    verify_test_results(run_test(Nodes, BKV), ConsistentType, ProcFun, QueueFun),
+    QueueFun = build_predicate_lte(test_vnode_protection, (?THRESHOLD+1), "QueueFun", "QueueSize"),
+    verify_test_results(run_test(Nodes, BKV), BKV, ProcFun, QueueFun),

     [Node1 | _] = Nodes,
     CheckInterval = ?THRESHOLD div 2,
@@ -132,36 +157,31 @@ test_vnode_protection(Nodes, BKV, ConsistentType) ->
     Pid = suspend_vnode_proxy(Victim),
     ProcFun2 = build_predicate_gte("test_vnode_protection after suspend",
                                    (?NUM_REQUESTS), "ProcFun", "Procs"),
-    QueueFun2 = build_predicate_lt("test_vnode_protection after suspend",
-                                   (?NUM_REQUESTS), "QueueFun", "QueueSize"),
-    verify_test_results(run_test(Nodes, BKV), ConsistentType, ProcFun2, QueueFun2),
+    QueueFun2 = build_predicate_lte("test_vnode_protection after suspend",
+                                    (?THRESHOLD+1), "QueueFun", "QueueSize"),
+    verify_test_results(run_test(Nodes, BKV), BKV, ProcFun2, QueueFun2),
     Pid ! resume,
     ok.

 %% Don't check on fast path
-test_fsm_protection(_, {{<<"write_once_type">>, _}, _, _}, _) ->
+test_fsm_protection(_, ?WRITE_ONCE_BKV) ->
     ok;
-%% Or consistent path - doesn't use FSMs either
-test_fsm_protection(_, _, true) ->
+%% Or consistent gets, as they don't use the FSM either
+test_fsm_protection(_, ?CONSISTENT_BKV) ->
     ok;
-test_fsm_protection(Nodes, BKV, false) ->
+test_fsm_protection(Nodes, BKV) ->
     lager:info("Testing with coordinator protection enabled"),
     lager:info("Setting FSM limit to ~b", [?THRESHOLD]),
-    Config = [{riak_kv, [
-        {fsm_limit, ?THRESHOLD},
-        {vnode_overload_threshold, undefined},
-        {vnode_check_interval, undefined}]}],
+    %% Set FSM limit and reset other changes from previous tests.
+    Config = default_config(#config{fsm_limit=?THRESHOLD}),
     rt:pmap(fun(Node) ->
                 rt:update_app_config(Node, Config)
             end, Nodes),
     Node1 = hd(Nodes),

-    %% TODO: Figure out why just using rt:wait_for_service completely breaks this test,
-    %% but not waiting for riak_kv leaves us open to a race where the resource doesn't exist yet.
-    %% Do the retry dance instead for now inside get_calculated_sj_limit.
     rt:wait_for_cluster_service(Nodes, riak_kv),
     rt:load_modules_on_nodes([?MODULE], Nodes),
-    {ok, ExpectedFsms} = get_calculated_sj_limit(Node1, riak_kv_get_fsm_sj),
+    {ok, ExpectedFsms} = get_calculated_sj_limit(Node1, riak_kv_get_fsm_sj, 1),

     %% We expect exactly ExpectedFsms, but because of a race in SideJob we sometimes get 1 more
     %% Adding 2 (the highest observed race to date) to the lte predicate to handle the occasional case.
@@ -170,7 +190,7 @@ test_fsm_protection(Nodes, BKV, false) ->
                                   "ProcFun", "Procs"),
     QueueFun = build_predicate_lt(test_fsm_protection, (?NUM_REQUESTS),
                                   "QueueFun", "QueueSize"),
-    verify_test_results(run_test(Nodes, BKV), false, ProcFun, QueueFun),
+    verify_test_results(run_test(Nodes, BKV), BKV, ProcFun, QueueFun),

     ok.

@@ -191,16 +211,14 @@ get_calculated_sj_limit(Node, ResourceName, Retries) when Retries > 0 ->
 get_calculated_sj_limit(Node, ResourceName, Retries) when Retries == 0 ->
     {error, io_lib:format("Failed to retrieve sidejob limit from ~p for resource ~p. Giving up.", [Node, ResourceName])}.

-test_cover_queries_overload(_Nodes, _, true) ->
-    ok;
-test_cover_queries_overload(Nodes, _, false) ->
+test_cover_queries_overload(Nodes) ->
     lager:info("Testing cover queries with vnode queue protection enabled"),
     lager:info("Setting vnode overload threshold to ~b", [?THRESHOLD]),
     lager:info("Setting vnode check interval to 1"),

-    Config = [{riak_core, [{vnode_overload_threshold, ?THRESHOLD},
-                           {vnode_check_request_interval, 2},
-                           {vnode_check_interval, 1}]}],
+    Config = default_config(#config{vnode_overload_threshold=?THRESHOLD,
+                                    vnode_check_request_interval=2,
+                                    vnode_check_interval=1}),
     rt:pmap(fun(Node) ->
                 rt:update_app_config(Node, Config)
             end, Nodes),
@@ -363,17 +381,17 @@ pb_get_fun(Node, Bucket, Key, TestPid) ->
     PBC = rt:pbc(Node),
     Result = case catch riakc_pb_socket:get(PBC, Bucket, Key) of
                  {error, <<"overload">>} ->
-                     lager:info("overload detected in pb_get, continuing..."),
+                     lager:debug("overload detected in pb_get, continuing..."),
                      true;
                  %% we expect timeouts in this test as we've shut down a vnode - return true in this case
                  {error, timeout} ->
-                     lager:info("timeout detected in pb_get, continuing..."),
+                     lager:debug("timeout detected in pb_get, continuing..."),
                      true;
                  {error, <<"timeout">>} ->
-                     lager:info("timeout detected in pb_get, continuing..."),
+                     lager:debug("timeout detected in pb_get, continuing..."),
                      true;
                  {ok, Res} ->
-                     lager:info("riakc_pb_socket:get(~p, ~p, ~p) succeeded, Res:~p", [PBC, Bucket, Key, Res]),
+                     lager:debug("riakc_pb_socket:get(~p, ~p, ~p) succeeded, Res:~p", [PBC, Bucket, Key, Res]),
                      true;
                  {error, Type} ->
                      lager:error("riakc_pb_socket threw error ~p reading {~p, ~p}, retrying...", [Type, Bucket, Key]),
