Skip to content

Commit 4d60fe2

Browse files
committed
fix(schema): avoid deadlock on reload_schema error
Ensure reload_schema always clears reload_in_progress flag and broadcasts condition variable even if schema reload fails or times out. Previously, an early return on error left reload_in_progress set to true, causing subsequent crud.schema() calls to wait until timeout indefinitely. Add integration test reproducing the issue and verifying that schema reload can be retried successfully after a failure. Fixes #479
1 parent 5efb264 commit 4d60fe2

File tree

3 files changed

+70
-11
lines changed

3 files changed

+70
-11
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
1515
* Move bucket_unref out of transaction.
1616
* Prevent duplicate metrics if init function is called multiple times.
1717
* Prevent creating duplicate triggers on `_crud_settings_local` space if init function is called multiple times.
18+
* Fix deadlock in `crud.schema()` after schema reload error (#479).
1819

1920
## [1.7.2] - 28-01-26
2021

crud/common/schema.lua

Lines changed: 16 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -58,28 +58,33 @@ function schema.reload_schema(vshard_router)
5858
local replicasets = vshard_router:routeall()
5959
local vshard_router_name = vshard_router.name
6060

61+
if reload_schema_cond[vshard_router_name] == nil then
62+
reload_schema_cond[vshard_router_name] = fiber.cond()
63+
end
64+
6165
if reload_in_progress[vshard_router_name] == true then
6266
if not reload_schema_cond[vshard_router_name]:wait(const.RELOAD_SCHEMA_TIMEOUT) then
6367
return nil, ReloadSchemaError:new('Waiting for schema to be reloaded is timed out')
6468
end
65-
else
66-
reload_in_progress[vshard_router_name] = true
67-
if reload_schema_cond[vshard_router_name] == nil then
68-
reload_schema_cond[vshard_router_name] = fiber.cond()
69-
end
69+
return true
70+
end
7071

71-
local ok, err = call_reload_schema(replicasets)
72-
if not ok then
73-
return nil, err
74-
end
72+
reload_in_progress[vshard_router_name] = true
73+
74+
local ok, err = call_reload_schema(replicasets)
75+
76+
reload_in_progress[vshard_router_name] = false
7577

76-
reload_schema_cond[vshard_router_name]:broadcast()
77-
reload_in_progress[vshard_router_name] = false
78+
if not ok then
79+
return nil, err
7880
end
7981

82+
reload_schema_cond[vshard_router_name]:broadcast()
83+
8084
return true
8185
end
8286

87+
8388
-- schema.wrap_func_reload calls func with specified arguments.
8489
-- func should return `res, err, need_reload`
8590
-- If function returned error and `need_reload` is true,
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
local t = require('luatest')
2+
local helpers = require('test.helper')
3+
4+
local pgroup = t.group('schema_reload_deadlock', helpers.backend_matrix({
5+
{ engine = 'memtx' },
6+
}))
7+
8+
pgroup.before_all(function(g)
9+
helpers.start_default_cluster(g, 'srv_select')
10+
end)
11+
12+
pgroup.after_all(function(g)
13+
helpers.stop_cluster(g.cluster, g.params.backend)
14+
end)
15+
16+
pgroup.before_each(function(g)
17+
g._old_reload_schema_timeout = g.router:exec(function(new_timeout)
18+
local const = require('crud.common.const')
19+
local old = const.RELOAD_SCHEMA_TIMEOUT
20+
const.RELOAD_SCHEMA_TIMEOUT = new_timeout
21+
return old
22+
end, { 0.2 })
23+
end)
24+
25+
pgroup.after_each(function(g)
26+
if g._old_reload_schema_timeout ~= nil then
27+
g.router:exec(function(prev_timeout)
28+
local const = require('crud.common.const')
29+
const.RELOAD_SCHEMA_TIMEOUT = prev_timeout
30+
end, { g._old_reload_schema_timeout })
31+
32+
g._old_reload_schema_timeout = nil
33+
end
34+
end)
35+
36+
pgroup.test_schema_reload_in_progress_is_cleared_after_error = function(g)
37+
local _, err = g.router:call('crud.schema')
38+
t.assert_equals(err, nil)
39+
40+
t.assert_is_not(g.cluster:server('s2-master'), nil)
41+
g.cluster:server('s2-master'):stop()
42+
43+
local _, err1 = g.router:call('crud.schema')
44+
t.assert_is_not(err1, nil)
45+
t.assert_str_contains(err1.err, 'timed out')
46+
47+
g.cluster:server('s2-master'):start()
48+
49+
t.helpers.retrying({ timeout = 30, delay = 0.1 }, function()
50+
local _, err2 = g.router:call('crud.schema')
51+
t.assert_equals(err2, nil)
52+
end)
53+
end

0 commit comments

Comments
 (0)