
Commit 5f6a47a

Authored and committed by Doug Rohrer
Update pipe_verify_handoff_blocking to provoke vnode_down errors.
Use intercepts to provoke an otherwise-occasional error where a vnode has been transferred after the vnode proxy has gotten its pid but before the work can be sent. The test will fail until the fix to `riak_pipe_vnode:queue_work_wait` is merged. See basho/riak_pipe#106 for the fix.
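
For context, here is a minimal sketch (not the actual patch in basho/riak_pipe#106) of the kind of change the commit message anticipates in `riak_pipe_vnode:queue_work_wait`: a monitor-based wait that treats a `noproc` 'DOWN' message the same as a `normal` vnode exit instead of surfacing a `vnode_down` error. The function and variable names below are illustrative assumptions, not riak_pipe's real API.

%% Illustrative sketch only - assumed names, not the real riak_pipe code.
%% Idea: when the monitored vnode is already gone ('DOWN' reason noproc),
%% recover the same way as when it shut down cleanly (normal), e.g. by
%% retrying on the next vnode in the preflist, rather than returning
%% {error, [{vnode_down, noproc} | ...]}.
wait_for_vnode_reply(ReplyRef, MonRef, UsedPreflist, RetryFun) ->
    receive
        {ReplyRef, Reply} ->
            erlang:demonitor(MonRef, [flush]),
            Reply;
        {'DOWN', MonRef, process, _VnodePid, normal} ->
            %% vnode exited cleanly (e.g. handoff finished): retry elsewhere
            RetryFun(UsedPreflist);
        {'DOWN', MonRef, process, _VnodePid, noproc} ->
            %% vnode was already dead when monitored: treat like 'normal'
            RetryFun(UsedPreflist);
        {'DOWN', MonRef, process, _VnodePid, Reason} ->
            {error, [{vnode_down, Reason} | UsedPreflist]}
    end.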
1 parent 5f01423 · commit 5f6a47a

File tree: 2 files changed (+59, -15 lines)


intercepts/riak_core_vnode_master_intercepts.erl

Lines changed: 42 additions & 0 deletions
@@ -8,6 +8,19 @@
          forwardable :: boolean(),
          opts = [] :: list()}).
 
+-record(fitting,
+        {
+         pid :: pid(),
+         ref :: reference(),
+         chashfun,
+         nval
+        }).
+
+-record(cmd_enqueue, {fitting :: #fitting{},
+                      input :: term(),
+                      timeout,
+                      usedpreflist}).
+
 -define(M, riak_core_vnode_master_orig).
 
 
@@ -32,4 +45,33 @@ stop_vnode_after_bloom_fold_request_succeeds(IndexNode, Req, Sender, VMaster) ->
             ?M:command_return_vnode_orig(IndexNode, Req, Sender, VMaster)
         end;
         false -> ?M:command_return_vnode_orig(IndexNode, Req, Sender, VMaster)
+    end.
+
+stop_pipe_vnode_after_request_sent(IndexNode, Req, Sender, VMaster) ->
+    case Req of
+        #cmd_enqueue{} = _Req ->
+            %% ?I_INFO("Intercepting riak_core_vnode_master:command_returning_vnode"),
+            random:seed(os:timestamp()),
+            case random:uniform(20) of
+                5 ->
+                    %% Simulate what happens when a VNode completes handoff between command_returning_vnode
+                    %% and the fold attempting to start - other attempts to intercept and slow
+                    %% certain parts of Riak to invoke the particular race condition were unsuccessful
+                    ?I_INFO("Replaced VNode with spawned function in command_returning_vnode"),
+                    Runner = self(),
+                    VNodePid = spawn(fun() ->
+                                             Runner ! go,
+                                             exit(normal)
+                                     end),
+                    receive
+                        go -> ok
+                    end,
+                    %% Still need to send the work
+                    ?M:command_return_vnode_orig(IndexNode, Req, Sender, VMaster),
+                    {ok, VNodePid};
+                _ ->
+                    ?M:command_return_vnode_orig(IndexNode, Req, Sender, VMaster)
+            end;
+        _ ->
+            ?M:command_return_vnode_orig(IndexNode, Req, Sender, VMaster)
     end.
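
The intercept works because the pid it hands back has already exited by the time the test code monitors it: monitoring an already-dead process delivers a 'DOWN' message with reason noproc immediately. A standalone illustration of the Erlang behaviour being relied on (demo_noproc is a made-up helper, not part of the intercept module):

%% Hypothetical demonstration, not part of the intercept module.
demo_noproc() ->
    Pid = spawn(fun() -> ok end),        %% exits normally right away
    timer:sleep(100),                    %% give it time to be gone
    Ref = erlang:monitor(process, Pid),  %% monitor the dead process
    receive
        {'DOWN', Ref, process, Pid, Reason} ->
            Reason                       %% => noproc
    end.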

tests/pipe_verify_handoff_blocking.erl

Lines changed: 17 additions & 15 deletions
@@ -43,17 +43,6 @@
 %% archive), but we don't want them to process so many inputs that
 %% they consume their blocking queues before handing off.
 
-%% Please Note: Under rare circumstances, this test may fail with a
-%% "{badmatch,{error,[{vnode_down,noproc}]}}' error. This is not a
-%% failure of this test but rather a side effect of a race condition
-%% in riak_core_vnode_proxy. It manifests due to the fact that the
-%% test is attempting to send a command to a vnode that is in fact
-%% down, however monitor only works by issuing a command and getting
-%% a PID. In some instances, get_vnode_pid fails because vnode shutdown
-%% is queued up in the mailbox before monitor node. Unfortunately, the
-%% fix would require a fundamental shift in the architecture of
-%% riak_core, which at the time of this writing is not feasible for
-%% this rare failure case.
 -module(pipe_verify_handoff_blocking).
 
 -export([
@@ -80,9 +69,12 @@ confirm() ->
     Inputs = lists:seq(1, 20),
 
     lager:info("Start ~b nodes", [?NODE_COUNT]),
-    NodeDefs = lists:duplicate(?NODE_COUNT, {current, default}),
+    Config = [{riak_core, [{ring_creation_size, 8},
+                           {vnode_management_timer, 1000},
+                           {handoff_concurrency, 100},
+                           {vnode_inactivity_timeout, 1000}]}],
     Services = [riak_pipe],
-    [Primary,Secondary] = Nodes = rt:deploy_nodes(NodeDefs, Services),
+    [Primary,Secondary] = Nodes = rt:deploy_nodes(?NODE_COUNT, Config, Services),
     %% Ensure each node owns 100% of it's own ring
     [?assertEqual([Node], rt:owners_according_to(Node)) || Node <- Nodes],
 
@@ -114,14 +106,18 @@ confirm() ->
 
     lager:info("Unpause workers"),
     Runner ! go,
+    set_up_vnode_crashing_intercept(Primary),
 
     ok = rt:wait_until_transfers_complete(Nodes),
 
     FillerInputCount = stop_fillers(Fillers),
 
     %% if we make it this far, then no filler ever saw the vnode_down
     %% error message; otherwise badmatches in queue_filler/4 will have
-    %% halted the test
+    %% halted the test. Note that we should not see this test fail
+    %% with `{error,[{vnode_down,noproc}]}}` errors, as the `noproc` case
+    %% should be handled similarly to the `normal` exit case in
+    %% `riak_pipe_vnode:queue_work_wait`
 
     _Status2 = pipe_status(Primary, Pipe),
 
@@ -139,7 +135,13 @@ confirm() ->
     lager:info("~s: PASS", [atom_to_list(?MODULE)]),
     pass.
 
-%%% queue filling
+set_up_vnode_crashing_intercept(Primary) ->
+    lager:info("Add intercept to kill vnode before calling the wait function"),
+    rt_intercept:add(Primary, {riak_core_vnode_master,
+                               [{{command_return_vnode, 4},
+                                 stop_pipe_vnode_after_request_sent}]}).
+
+%% queue filling
 
 %% @doc fill pipe vnode queues by repeatedly sending each input in the
 %% input list until the queue reports timeout.
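
The "badmatches in queue_filler/4" mentioned in the diff refer to the usual Erlang style of asserting on a call's result, so any surfaced vnode_down error crashes the filler process and halts the test. A hedged sketch of that shape (fill_one is illustrative and the queue_work call is an assumption, not the literal queue_filler/4 code):

%% Hypothetical sketch of the assertion style the test comment describes.
%% If the enqueue ever returned {error, [{vnode_down, noproc}]}, this match
%% would fail with a badmatch and crash the calling process.
fill_one(Pipe, Input) ->
    ok = riak_pipe:queue_work(Pipe, Input).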
