Skip to content

Commit 9942499

Browse files
nshysergepetrenko
authored andcommitted
box: rollback txns in WAL queue before in-flight txns
In case of a WAL error we should roll back txns in the WAL queue before any in-flight txns (already submitted to the WAL thread). The workaround is to disable the WAL queue with `box.cfg{wal_queue_max_size = 0}` so that no request can be queued. Closes tarantool#11179 NO_DOC=bugfix
1 parent f1ccb2f commit 9942499

File tree

5 files changed

+167
-0
lines changed

5 files changed

+167
-0
lines changed
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
## bugfix/box
2+
3+
* Fixed a bug when transactions in the WAL queue were not rolled back on WAL
4+
error (gh-11179).

src/box/journal.c

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -162,6 +162,21 @@ journal_queue_flush(void)
162162
fiber_sleep(0);
163163
}
164164

/*
 * Fail every request currently held in journal_queue.requests: each one
 * gets res = JOURNAL_ENTRY_ERR_CASCADE, is marked complete and has its
 * write_async_cb invoked. Takes no arguments and returns nothing.
 */
165+
void
166+
journal_queue_rollback(void)
167+
{
168+
struct stailq rollback;
169+
stailq_create(&rollback);
/*
 * Move the queued requests onto the local list before touching them.
 * NOTE(review): stailq_concat presumably leaves journal_queue.requests
 * empty - confirm against the stailq implementation.
 */
170+
stailq_concat(&rollback, &journal_queue.requests);
/*
 * Walk the requests in the opposite order to the one they are stored in -
 * presumably so that the most recently queued request is rolled back
 * first; confirm.
 */
171+
stailq_reverse(&rollback);
172+
struct journal_entry *req;
173+
stailq_foreach_entry(req, &rollback, fifo) {
/* Report a cascading-rollback failure through the entry's own callback. */
174+
req->res = JOURNAL_ENTRY_ERR_CASCADE;
175+
req->is_complete = true;
176+
req->write_async_cb(req);
177+
}
178+
}
179+
165180
int
166181
journal_write_row(struct xrow_header *row)
167182
{

src/box/journal.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -229,6 +229,10 @@ journal_queue_on_complete(const struct journal_entry *entry)
229229
assert(journal_queue.size >= 0);
230230
}
231231

232+
/** Rollback all txns waiting in queue. */
233+
void
234+
journal_queue_rollback(void);
235+
232236
/**
233237
* Complete asynchronous write.
234238
*/
@@ -271,6 +275,7 @@ journal_write_submit(struct journal_entry *entry)
271275
journal_queue_on_append(entry);
272276
if (current_journal->write_async(current_journal, entry) != 0) {
273277
journal_queue_on_complete(entry);
278+
journal_queue_rollback();
274279
return -1;
275280
}
276281
return 0;

src/box/wal.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -337,6 +337,7 @@ tx_complete_rollback(void)
337337
if (stailq_last_entry(&writer->rollback, struct journal_entry,
338338
fifo) != writer->last_entry)
339339
return;
340+
journal_queue_rollback();
340341
stailq_reverse(&writer->rollback);
341342
tx_schedule_queue(&writer->rollback);
342343
/* TX-thread can try sending transactions to WAL again. */
Lines changed: 142 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,142 @@
1+
local server = require('luatest.server')
2+
local t = require('luatest')
3+
4+
local g = t.group()
5+
-- Group setup: skip everything unless this is a debug build (the tests in
-- this file rely on box.error.injection), then create and start the server
-- that the tests reach through cg.server.
6+
g.before_all(function(cg)
7+
t.tarantool.skip_if_not_debug()
8+
cg.server = server:new()
9+
cg.server:start()
10+
end)
11+
-- Group teardown: drop the server stored in cg.server.
12+
g.after_all(function(cg)
13+
cg.server:drop()
14+
end)
15+
-- Cleanup for test_wal_queue_rollback_in_flight: switch off both error
-- injections that the test turns on and drop the 'test' space it creates.
16+
g.after_test('test_wal_queue_rollback_in_flight', function(cg)
17+
cg.server:exec(function()
18+
box.error.injection.set('ERRINJ_WAL_DELAY', false)
19+
box.error.injection.set('ERRINJ_WAL_WRITE', false)
20+
box.space.test:drop()
21+
end)
22+
end)
23+
-- Checks that when an in-flight txn (f1) fails with a WAL write error, the
-- txn waiting in the WAL queue (f2) fails with CASCADE_ROLLBACK and the
-- space is left with only the original tuple {1}.
24+
g.test_wal_queue_rollback_in_flight = function(cg)
25+
cg.server:exec(function()
26+
local fiber = require('fiber')
27+
local s = box.schema.create_space('test')
28+
s:create_index('pk')
-- Queue limit of 100 is far smaller than the ~1000-byte tuple written by
-- f1 below; presumably this is what makes f2 wait in the queue - confirm.
29+
box.cfg{wal_queue_max_size = 100}
30+
s:insert({1})
31+
box.error.injection.set('ERRINJ_WAL_DELAY', true)
32+
-- In case txn in WAL queue (f2) is not rolled back we get duplicate
33+
-- error on rollback of in-flight txn (f1) and as a result failed
34+
-- assertion or panic.
-- f1: in one transaction delete {1} and insert a ~1000-byte tuple {100, ...}.
35+
local f1 = fiber.new(function()
36+
box.begin()
37+
s:delete({1})
38+
s:insert({100, string.rep('a', 1000)})
39+
box.commit()
40+
end)
41+
f1:set_joinable(true)
-- Yield so that the new fiber gets a chance to run before f2 is created.
42+
fiber.yield()
-- f2: insert {1} again, the key that f1 has just deleted.
43+
local f2 = fiber.new(function()
44+
s:insert({1})
45+
end)
46+
f2:set_joinable(true)
47+
fiber.yield()
-- Turn on ERRINJ_WAL_WRITE first, then lift ERRINJ_WAL_DELAY.
48+
box.error.injection.set('ERRINJ_WAL_WRITE', true)
49+
box.error.injection.set('ERRINJ_WAL_DELAY', false)
-- f1 is expected to fail with the WAL I/O error itself.
50+
local ok, err = f1:join()
51+
t.assert_not(ok)
52+
t.assert_covers(err:unpack(), {
53+
type = 'ClientError',
54+
code = box.error.WAL_IO,
55+
message = 'Failed to write to disk',
56+
})
-- f2 is expected to fail with the cascading rollback error.
57+
local ok, err = f2:join()
58+
t.assert_not(ok)
59+
t.assert_covers(err:unpack(), {
60+
type = 'ClientError',
61+
code = box.error.CASCADE_ROLLBACK,
62+
message = 'WAL has a rollback in progress',
63+
})
-- Both fibers failed, so only the tuple inserted before them must remain.
64+
t.assert_equals(s:select(), {{1}})
65+
end)
66+
end
67+
-- Cleanup for test_wal_queue_rollback_cascade: switch off both error
-- injections that the test turns on and drop the 'test' space it creates.
68+
g.after_test('test_wal_queue_rollback_cascade', function(cg)
69+
cg.server:exec(function()
70+
box.error.injection.set('ERRINJ_WAL_DELAY', false)
71+
box.error.injection.set('ERRINJ_WAL_IO', false)
72+
box.space.test:drop()
73+
end)
74+
end)
75+
76+
--
77+
-- Here we test a different situation. Part of in-flight requests can be
78+
-- successfully written and part of requests are not. Imagine also in-flight
79+
-- requests are split into 2 batches. The first has both successful and not
80+
-- successful and the second has only unsuccessful requests. So when first
81+
-- batch is returned to TX thread we don't proceed with rollback yet
82+
-- waiting for the second batch. But we complete the successful part of
83+
-- the batch and thus wakeup journal queue. Woken up fiber will try
84+
-- to submit new request to WAL but fail as cascade rollback is in
85+
-- progress. The failed request from journal will be rolled back and
86+
-- this is not correct. First we should rollback newer requests from the
87+
-- journal queue.
88+
--
89+
-- Why do we need two batches here? With only one batch we first rollback
90+
-- failed requests from batch and thus rollback journal queue.
91+
--
92+
-- Note also that this situation is hard to reproduce directly. Thus it is
93+
-- modelled here by ERRINJ_WAL_IO injection.
94+
--
95+
g.test_wal_queue_rollback_cascade = function(cg)
96+
cg.server:exec(function()
97+
local fiber = require('fiber')
98+
local s = box.schema.create_space('test')
99+
s:create_index('pk')
100+
s:insert({1})
101+
box.cfg{wal_queue_max_size = 100}
102+
box.error.injection.set('ERRINJ_WAL_DELAY', true)
-- f1: commit a ~1000-byte tuple {100, ...}; it is expected to succeed.
103+
local f1 = fiber.new(function()
104+
box.begin()
105+
s:insert({100, string.rep('a', 1000)})
106+
box.commit()
107+
end)
108+
f1:set_joinable(true)
109+
fiber.yield()
110+
-- In case txn in WAL queue (f3) is not rolled back we get duplicate
111+
-- error on rollback of in-flight txn (f2) and as a result failed
112+
-- assertion or panic.
-- f2: delete {1}; expected to fail with the WAL I/O error.
113+
local f2 = fiber.new(function()
114+
s:delete({1})
115+
end)
116+
f2:set_joinable(true)
117+
fiber.yield()
-- f3: insert {1} again; expected to fail with the cascading rollback error.
118+
local f3 = fiber.new(function()
119+
s:insert({1})
120+
end)
121+
f3:set_joinable(true)
122+
fiber.yield()
-- Turn on ERRINJ_WAL_IO first, then lift ERRINJ_WAL_DELAY.
123+
box.error.injection.set('ERRINJ_WAL_IO', true)
124+
box.error.injection.set('ERRINJ_WAL_DELAY', false)
-- f1:join() must return just `true`, i.e. f1 finished without an error.
125+
t.assert_equals({f1:join()}, {true})
126+
local ok, err = f2:join()
127+
t.assert_not(ok)
128+
t.assert_covers(err:unpack(), {
129+
type = 'ClientError',
130+
code = box.error.WAL_IO,
131+
message = 'Failed to write to disk',
132+
})
133+
local ok, err = f3:join()
134+
t.assert_not(ok)
135+
t.assert_covers(err:unpack(), {
136+
type = 'ClientError',
137+
code = box.error.CASCADE_ROLLBACK,
138+
message = 'WAL has a rollback in progress',
139+
})
-- The 'lt' iterator from key 100 leaves out f1's tuple itself; among the
-- smaller keys only the original {1} must remain.
140+
t.assert_equals(s:select({100}, {iterator = 'lt'}), {{1}})
141+
end)
142+
end

0 commit comments

Comments
 (0)