3131-define (GET_RETRIES , 1000 ).
3232-define (BUCKET , <<" test" >>).
3333-define (KEY , <<" hotkey" >>).
34+ -define (NORMAL_TYPE , <<" normal_type" >>).
35+ -define (CONSISTENT_TYPE , <<" consistent_type" >>).
36+ -define (WRITE_ONCE_TYPE , <<" write_once_type" >>).
37+ -define (NORMAL_BKV , {{? NORMAL_TYPE , ? BUCKET }, ? KEY , <<" test" >>}).
38+ -define (CONSISTENT_BKV , {{? CONSISTENT_TYPE , ? BUCKET }, ? KEY , <<" test" >>}).
39+ -define (WRITE_ONCE_BKV , {{? WRITE_ONCE_TYPE , ? BUCKET }, ? KEY , <<" test" >>}).
40+
41+ % % This record contains the default values for config settings if they were not set
42+ % % in the advanced.config file - because setting something to `undefined` is not the same
43+ % % as not setting it at all, we need to make sure to overwrite with defaults for each test,
44+ % % not just set things back to `undefined`. Also, makes the tests re-orderable as they always
45+ % % set everything they need, and don't depend on a previous test to make changes.
46+ -record (config , {
47+ vnode_overload_threshold = 10000 ,
48+ vnode_check_interval = 5000 ,
49+ vnode_check_request_interval = 2500 ,
50+ fsm_limit = undefined }).
51+
52+ default_config () ->
53+ default_config (# config {}).
54+
55+ default_config (# config {
56+ vnode_overload_threshold = VnodeOverloadThreshold ,
57+ vnode_check_interval = VnodeCheckInterval ,
58+ vnode_check_request_interval = VnodeCheckRequestInterval ,
59+ fsm_limit = FsmLimit
60+ }) ->
61+ [{riak_core , [{ring_creation_size , 8 },
62+ {default_bucket_props , [{n_val , 5 }]},
63+ {vnode_management_timer , 1000 },
64+ {enable_health_checks , false },
65+ {enable_consensus , true },
66+ {vnode_overload_threshold , VnodeOverloadThreshold },
67+ {vnode_check_interval , VnodeCheckInterval },
68+ {vnode_check_request_interval , VnodeCheckRequestInterval }]},
69+ {riak_kv , [{fsm_limit , FsmLimit },
70+ {storage_backend , riak_kv_eleveldb_backend },
71+ {anti_entropy_build_limit , {100 , 1000 }},
72+ {anti_entropy_concurrency , 100 },
73+ {anti_entropy_tick , 100 },
74+ {anti_entropy , {on , []}},
75+ {anti_entropy_timeout , 5000 }]},
76+ {riak_api , [{pb_backlog , 1024 }]}].
3477
3578confirm () ->
3679 Nodes = setup (),
3780
38- NormalType = <<" normal_type" >>,
39- ConsistentType = <<" consistent_type" >>,
40- WriteOnceType = <<" write_once_type" >>,
41-
42- ok = create_bucket_type (Nodes , NormalType , [{n_val , 3 }]),
43- ok = create_bucket_type (Nodes , ConsistentType , [{consistent , true }, {n_val , 5 }]),
44- ok = create_bucket_type (Nodes , WriteOnceType , [{write_once , true }, {n_val , 1 }]),
81+ ok = create_bucket_type (Nodes , ? NORMAL_TYPE , [{n_val , 3 }]),
82+ ok = create_bucket_type (Nodes , ? CONSISTENT_TYPE , [{consistent , true }, {n_val , 5 }]),
83+ ok = create_bucket_type (Nodes , ? WRITE_ONCE_TYPE , [{write_once , true }, {n_val , 1 }]),
4584 rt :wait_until (ring_manager_check_fun (hd (Nodes ))),
4685
47- BKV1 = {{NormalType , ? BUCKET }, ? KEY , <<" test" >>},
48- BKV2 = {{ConsistentType , ? BUCKET }, ? KEY , <<" test" >>},
49- BKV3 = {{WriteOnceType , ? BUCKET }, ? KEY , <<" test" >>},
86+
5087 Node1 = hd (Nodes ),
51- write_once (Node1 , BKV1 ),
52- write_once (Node1 , BKV2 ),
53- write_once (Node1 , BKV3 ),
88+ write_once (Node1 , ? NORMAL_BKV ),
89+ write_once (Node1 , ? CONSISTENT_BKV ),
90+ write_once (Node1 , ? WRITE_ONCE_BKV ),
5491
5592 Tests = [test_no_overload_protection ,
5693 test_vnode_protection ,
57- test_fsm_protection ,
58- test_cover_queries_overload ],
94+ test_fsm_protection ],
5995
6096 [begin
6197 lager :info (" Starting Test ~p for ~p~n " , [Test , BKV ]),
62- ok = erlang :apply (? MODULE , Test , [Nodes , BKV , IsConsistent ])
98+ ok = erlang :apply (? MODULE , Test , [Nodes , BKV ])
6399 end || Test <- Tests ,
64- {BKV , IsConsistent } <- [{BKV1 , false },
65- {BKV2 , true },
66- {BKV3 , false }]],
100+ BKV <- [? NORMAL_BKV ,
101+ ? CONSISTENT_BKV ,
102+ ? WRITE_ONCE_BKV ]],
103+ % % Test cover queries doesn't depend on bucket/keyvalue, just run it once
104+ test_cover_queries_overload (Nodes ),
67105 pass .
68106
69107
70108setup () ->
71109 ensemble_util :build_cluster (5 , default_config (), 5 ).
72110
73- default_config () ->
74- [{riak_core , [{ring_creation_size , 8 },
75- {default_bucket_props , [{n_val , 5 }]},
76- {vnode_management_timer , 1000 },
77- {enable_health_checks , false },
78- {enable_consensus , true },
79- {vnode_overload_threshold , undefined }]},
80- {riak_kv , [{fsm_limit , undefined },
81- {storage_backend , riak_kv_eleveldb_backend },
82- {anti_entropy_build_limit , {100 , 1000 }},
83- {anti_entropy_concurrency , 100 },
84- {anti_entropy_tick , 100 },
85- {anti_entropy , {on , []}},
86- {anti_entropy_timeout , 5000 }]},
87- {riak_api , [{pb_backlog , 1024 }]}].
88-
89- test_no_overload_protection (_Nodes , _BKV , true ) ->
111+ test_no_overload_protection (_Nodes , ? CONSISTENT_BKV ) ->
90112 ok ;
91- test_no_overload_protection (Nodes , BKV , ConsistentType ) ->
113+ test_no_overload_protection (Nodes , BKV ) ->
114+ lager :info (" Setting default configuration for no overload protestion test." ),
115+ rt :pmap (fun (Node ) ->
116+ rt :update_app_config (Node , default_config ())
117+ end , Nodes ),
92118 lager :info (" Testing with no overload protection" ),
93119 ProcFun = build_predicate_eq (test_no_overload_protection , ? NUM_REQUESTS ,
94120 " ProcFun" , " Procs" ),
95121 QueueFun = build_predicate_gte (test_no_overload_protection , ? NUM_REQUESTS ,
96122 " QueueFun" , " Queue Size" ),
97- verify_test_results (run_test (Nodes , BKV ), ConsistentType , ProcFun , QueueFun ).
123+ verify_test_results (run_test (Nodes , BKV ), BKV , ProcFun , QueueFun ).
98124
99- verify_test_results ({_NumProcs , QueueLen }, true , _ , QueueFun ) ->
125+ verify_test_results ({_NumProcs , QueueLen }, ? CONSISTENT_BKV , _ProcFun , QueueFun ) ->
100126 ? assert (QueueFun (QueueLen ));
101- verify_test_results ({NumProcs , QueueLen }, false , ProcFun , QueueFun ) ->
127+ verify_test_results ({NumProcs , QueueLen }, _BKV , ProcFun , QueueFun ) ->
102128 ? assert (ProcFun (NumProcs )),
103129 ? assert (QueueFun (QueueLen )).
104130
105- test_vnode_protection (Nodes , BKV , ConsistentType ) ->
131+ test_vnode_protection (Nodes , BKV ) ->
106132 % % Setting check_interval to one ensures that process_info is called
107133 % % to check the queue length on each vnode send.
108134 % % This allows us to artificially raise vnode queue lengths with dummy
@@ -111,14 +137,13 @@ test_vnode_protection(Nodes, BKV, ConsistentType) ->
111137 lager :info (" Testing with vnode queue protection enabled" ),
112138 lager :info (" Setting vnode overload threshold to ~b " , [? THRESHOLD ]),
113139 lager :info (" Setting vnode check interval to 1" ),
114- Config = [{riak_core , [{vnode_overload_threshold , ? THRESHOLD },
115- {vnode_check_interval , 1 }]}],
140+ Config = default_config (# config {vnode_overload_threshold = ? THRESHOLD , vnode_check_interval = 1 }),
116141 rt :pmap (fun (Node ) ->
117142 rt :update_app_config (Node , Config )
118143 end , Nodes ),
119144 ProcFun = build_predicate_lt (test_vnode_protection , (? NUM_REQUESTS + 1 ), " ProcFun" , " Procs" ),
120- QueueFun = build_predicate_lt (test_vnode_protection , (? NUM_REQUESTS ), " QueueFun" , " QueueSize" ),
121- verify_test_results (run_test (Nodes , BKV ), ConsistentType , ProcFun , QueueFun ),
145+ QueueFun = build_predicate_lte (test_vnode_protection , (? THRESHOLD + 1 ), " QueueFun" , " QueueSize" ),
146+ verify_test_results (run_test (Nodes , BKV ), BKV , ProcFun , QueueFun ),
122147
123148 [Node1 | _ ] = Nodes ,
124149 CheckInterval = ? THRESHOLD div 2 ,
@@ -132,36 +157,31 @@ test_vnode_protection(Nodes, BKV, ConsistentType) ->
132157 Pid = suspend_vnode_proxy (Victim ),
133158 ProcFun2 = build_predicate_gte (" test_vnode_protection after suspend" ,
134159 (? NUM_REQUESTS ), " ProcFun" , " Procs" ),
135- QueueFun2 = build_predicate_lt (" test_vnode_protection after suspend" ,
136- (? NUM_REQUESTS ), " QueueFun" , " QueueSize" ),
137- verify_test_results (run_test (Nodes , BKV ), ConsistentType , ProcFun2 , QueueFun2 ),
160+ QueueFun2 = build_predicate_lte (" test_vnode_protection after suspend" ,
161+ (? THRESHOLD + 1 ), " QueueFun" , " QueueSize" ),
162+ verify_test_results (run_test (Nodes , BKV ), BKV , ProcFun2 , QueueFun2 ),
138163 Pid ! resume ,
139164 ok .
140165
141166% % Don't check on fast path
142- test_fsm_protection (_ , {{<< " write_once_type " >>, _ }, _ , _ }, _ ) ->
167+ test_fsm_protection (_ , ? WRITE_ONCE_BKV ) ->
143168 ok ;
144- % % Or consistent path - doesn 't use FSMs either
145- test_fsm_protection (_ , _ , true ) ->
169+ % % Or consistent gets, as they don 't use the FSM either
170+ test_fsm_protection (_ , ? CONSISTENT_BKV ) ->
146171 ok ;
147- test_fsm_protection (Nodes , BKV , false ) ->
172+ test_fsm_protection (Nodes , BKV ) ->
148173 lager :info (" Testing with coordinator protection enabled" ),
149174 lager :info (" Setting FSM limit to ~b " , [? THRESHOLD ]),
150- Config = [{riak_kv , [
151- {fsm_limit , ? THRESHOLD },
152- {vnode_overload_threshold , undefined },
153- {vnode_check_interval , undefined }]}],
175+ % % Set FSM limit and reset other changes from previous tests.
176+ Config = default_config (# config {fsm_limit = ? THRESHOLD }),
154177 rt :pmap (fun (Node ) ->
155178 rt :update_app_config (Node , Config )
156179 end , Nodes ),
157180 Node1 = hd (Nodes ),
158181
159- % % TODO: Figure out why just using rt:wait_for_service completely breaks this test,
160- % % but not waiting for riak_kv leaves us open to a race where the resource doesn't exist yet.
161- % % Do the retry dance instead for now inside get_calculated_sj_limit.
162182 rt :wait_for_cluster_service (Nodes , riak_kv ),
163183 rt :load_modules_on_nodes ([? MODULE ], Nodes ),
164- {ok , ExpectedFsms } = get_calculated_sj_limit (Node1 , riak_kv_get_fsm_sj ),
184+ {ok , ExpectedFsms } = get_calculated_sj_limit (Node1 , riak_kv_get_fsm_sj , 1 ),
165185
166186 % % We expect exactly ExpectedFsms, but because of a race in SideJob we sometimes get 1 more
167187 % % Adding 2 (the highest observed rasce to date) to the lte predicate to handle the occasional case.
@@ -170,7 +190,7 @@ test_fsm_protection(Nodes, BKV, false) ->
170190 " ProcFun" , " Procs" ),
171191 QueueFun = build_predicate_lt (test_fsm_protection , (? NUM_REQUESTS ),
172192 " QueueFun" , " QueueSize" ),
173- verify_test_results (run_test (Nodes , BKV ), false , ProcFun , QueueFun ),
193+ verify_test_results (run_test (Nodes , BKV ), BKV , ProcFun , QueueFun ),
174194
175195 ok .
176196
@@ -191,16 +211,14 @@ get_calculated_sj_limit(Node, ResourceName, Retries) when Retries > 0 ->
191211get_calculated_sj_limit (Node , ResourceName , Retries ) when Retries == 0 ->
192212 {error , io_lib :format (" Failed to retrieve sidejob limit from ~p for resource ~p . Giving up." , [Node , ResourceName ])}.
193213
194- test_cover_queries_overload (_Nodes , _ , true ) ->
195- ok ;
196- test_cover_queries_overload (Nodes , _ , false ) ->
214+ test_cover_queries_overload (Nodes ) ->
197215 lager :info (" Testing cover queries with vnode queue protection enabled" ),
198216 lager :info (" Setting vnode overload threshold to ~b " , [? THRESHOLD ]),
199217 lager :info (" Setting vnode check interval to 1" ),
200218
201- Config = [{ riak_core , [{ vnode_overload_threshold , ? THRESHOLD } ,
202- { vnode_check_request_interval , 2 } ,
203- { vnode_check_interval , 1 }]}] ,
219+ Config = default_config ( # config { vnode_overload_threshold = ? THRESHOLD ,
220+ vnode_check_request_interval = 2 ,
221+ vnode_check_interval = 1 }) ,
204222 rt :pmap (fun (Node ) ->
205223 rt :update_app_config (Node , Config )
206224 end , Nodes ),
@@ -363,17 +381,17 @@ pb_get_fun(Node, Bucket, Key, TestPid) ->
363381 PBC = rt :pbc (Node ),
364382 Result = case catch riakc_pb_socket :get (PBC , Bucket , Key ) of
365383 {error , <<" overload" >>} ->
366- lager :info (" overload detected in pb_get, continuing..." ),
384+ lager :debug (" overload detected in pb_get, continuing..." ),
367385 true ;
368386 % % we expect timeouts in this test as we've shut down a vnode - return true in this case
369387 {error , timeout } ->
370- lager :info (" timeout detected in pb_get, continuing..." ),
388+ lager :debug (" timeout detected in pb_get, continuing..." ),
371389 true ;
372390 {error , <<" timeout" >>} ->
373- lager :info (" timeout detected in pb_get, continuing..." ),
391+ lager :debug (" timeout detected in pb_get, continuing..." ),
374392 true ;
375393 {ok , Res } ->
376- lager :info (" riakc_pb_socket:get(~p , ~p , ~p ) succeeded, Res:~p " , [PBC , Bucket , Key , Res ]),
394+ lager :debug (" riakc_pb_socket:get(~p , ~p , ~p ) succeeded, Res:~p " , [PBC , Bucket , Key , Res ]),
377395 true ;
378396 {error , Type } ->
379397 lager :error (" riakc_pb_socket threw error ~p reading {~p , ~p }, retrying..." , [Type , Bucket , Key ]),
0 commit comments