Skip to content

Commit cd82d64

Browse files
georgiy-belyaninTotktonada
authored and committed
config: don't choose learners as bootstrap leaders
This patch makes Tarantool configured in `replication.failover = 'supervised'` mode not choose instances marked as learners for the failover coordinator as a bootstrap leader. Marking an instance as a learner means it shouldn't be appointed as a master by the failover. Thus, the user would expect learners not to become RW during the bootstrap either. Example. ``` replication: failover: supervised failover: replicasets: r-001: learners: - i-001 groups: g-001: replicasets: r-001: instances: i-001: {} i-002: {} ``` `i-002` will be chosen as a replicaset bootstrap leader. If all non-anonymous instances are marked as learners, the initial cluster bootstrap fails since every instance starts in RO and none of them can become the leader. If the user applies a config where all non-anonymous instances are marked as learners, an alert is issued since this config is likely to be wrong. ``` log.lua:74 W> box_cfg.apply: cannot determine a bootstrap leader based on the configuration. Make sure learners are properly configured ``` Closes tarantool#10842 NO_DOC=see tarantool/doc#4993
1 parent def4456 commit cd82d64

File tree

4 files changed

+152
-14
lines changed

4 files changed

+152
-14
lines changed
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
## feature/config
2+
3+
* Now Tarantool instances configured in the supervised failover mode don't
4+
choose learner instances as a bootstrap leader (gh-10842).

src/box/lua/config/applier/box_cfg.lua

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -158,7 +158,8 @@ end
158158

159159
-- {{{ Set RO/RW
160160

161-
local function set_ro_rw(configdata, box_cfg)
161+
local function set_ro_rw(config, box_cfg)
162+
local configdata = config._configdata
162163
-- The startup process may need a special handling and differs
163164
-- from the configuration reloading process.
164165
local is_startup = type(box.cfg) == 'function'
@@ -258,6 +259,13 @@ local function set_ro_rw(configdata, box_cfg)
258259
assert(false)
259260
end
260261
elseif failover == 'supervised' then
262+
if configdata:bootstrap_leader_name() == nil then
263+
local warning = 'box_cfg.apply: cannot determine a bootstrap ' ..
264+
'leader based on the configuration. Make sure learners are ' ..
265+
'properly configured'
266+
config._aboard:set({type = 'warn', message = warning})
267+
end
268+
261269
-- The startup flow in the 'supervised' failover mode is
262270
-- the following.
263271
--
@@ -1058,7 +1066,7 @@ local function apply(config)
10581066
set_replication_peers(configdata, box_cfg)
10591067
set_log(configdata, box_cfg)
10601068
set_audit_log(configdata, box_cfg)
1061-
set_ro_rw(configdata, box_cfg)
1069+
set_ro_rw(config, box_cfg)
10621070
revert_non_dynamic_options(config, box_cfg)
10631071
set_names_in_background(config, box_cfg)
10641072
set_bootstrap_leader(configdata, box_cfg)

src/box/lua/config/configdata.lua

Lines changed: 25 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,19 @@ local instance_config = require('internal.config.instance_config')
1010
local cluster_config = require('internal.config.cluster_config')
1111
local snapshot = require('internal.config.utils.snapshot')
1212

13+
-- {{{ General-purpose utils
14+
15+
-- Convert an array-like table into a set-like table whose keys
-- are the array items and whose values are all `true`.
--
-- Example: {'a', 'b', 'c'} => {a = true, b = true, c = true}
--
-- Iteration follows ipairs() semantics, so it stops at the first
-- nil element.
local function array2set(t)
    local set = {}
    for _, item in ipairs(t) do
        set[item] = true
    end
    return set
end
23+
24+
-- }}} General-purpose utils
25+
1326
local function choose_iconfig(self, opts)
1427
if opts ~= nil and opts.instance ~= nil then
1528
local instances = self._instances
@@ -1044,28 +1057,28 @@ local function new(iconfig, cconfig, instance_name)
10441057
end
10451058
assert(bootstrap_leader == nil)
10461059

1047-
-- Choose the first non-anonymous instance with the highest
1048-
-- priority specified in the failover configuration
1049-
-- section.
1060+
local failover_replicaset = instance_config:get(iconfig_def,
1061+
{'failover', 'replicasets', found.replicaset_name}) or {}
1062+
1063+
local failover_learners = array2set(failover_replicaset.learners or {})
1064+
local failover_priorities = failover_replicaset.priority or {}
1065+
1066+
-- Choose the first non-anonymous non-learner instance
1067+
-- with the highest priority specified in the failover
1068+
-- configuration section.
10501069
local max_priority = -math.huge
10511070
for _, peer_name in ipairs(peer_names) do
10521071
assert(peers[peer_name] ~= nil)
10531072
local iconfig_def = peers[peer_name].iconfig_def
10541073
local is_anon = instance_config:get(iconfig_def, 'replication.anon')
1074+
local is_learner = failover_learners[peer_name]
1075+
local priority = failover_priorities[peer_name] or 0
10551076

1056-
local priority = instance_config:get(iconfig_def, {
1057-
'failover',
1058-
'replicasets',
1059-
found.replicaset_name,
1060-
'priority',
1061-
peer_name}) or 0
1062-
1063-
if not is_anon and priority > max_priority then
1077+
if not is_anon and not is_learner and priority > max_priority then
10641078
bootstrap_leader_name = peer_name
10651079
max_priority = priority
10661080
end
10671081
end
1068-
assert(bootstrap_leader_name ~= nil)
10691082
end
10701083

10711084
-- Names and UUIDs are always validated: during instance start
Lines changed: 113 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,113 @@
1+
local t = require('luatest')
2+
local cbuilder = require('luatest.cbuilder')
3+
local cluster = require('luatest.cluster')
4+
5+
local g = t.group()
6+
7+
-- {{{ Helpers
8+
9+
-- Assert that the instance's `box.info.ro` value matches the
-- expected mode: it must be true when `mode` is 'ro' and false
-- for any other value (e.g. 'rw').
local function check_instance_mode(instance, mode)
    local expect_ro = mode == 'ro'
    local actual_ro = instance:eval('return box.info.ro')
    t.assert_equals(actual_ro, expect_ro)
end
12+
13+
-- Look through `box.info.config.alerts` on the given server and
-- return the first alert whose message starts with `prefix`.
-- Returns nil if there is no such alert.
local function find_alert(server, prefix)
    -- Executed on the server side, so it receives the prefix as
    -- an argument instead of capturing it.
    local function lookup(wanted)
        local alerts = box.info.config.alerts
        for _, entry in ipairs(alerts) do
            if entry.message:startswith(wanted) then
                return entry
            end
        end
        return nil
    end

    return server:exec(lookup, {prefix})
end
23+
24+
-- }}} Helpers
25+
26+
-- Verify that using `replication.failover = "supervised"` with
27+
-- `replication.bootstrap_strategy = "auto"` does not choose failover
28+
-- learners as a bootstrap leader.
29+
-- Three-instance replicaset in the supervised failover mode where
-- i-001 and i-002 are marked as learners: only the non-learner
-- i-003 is expected to come up in RW.
g.test_not_chooses_learners_as_bootstrap_leader = function()
    local learners = {'i-001', 'i-002'}

    local cfg = cbuilder:new()
        :set_global_option('failover.replicasets.r-001.learners', learners)
        :set_global_option('replication.failover', 'supervised')
        :use_replicaset('r-001')
        :add_instance('i-001', {})
        :add_instance('i-002', {})
        :add_instance('i-003', {})
        :config()

    local c = cluster:new(cfg)
    c:start()

    -- Checked in a fixed order: the learners first, then the
    -- only instance allowed to become the bootstrap leader.
    local expected = {
        {'i-001', 'ro'},
        {'i-002', 'ro'},
        {'i-003', 'rw'},
    }
    for _, item in ipairs(expected) do
        check_instance_mode(c[item[1]], item[2])
    end
end
47+
48+
-- Check a singleton replicaset bootstrap fails if the instance
49+
-- is marked as a learner.
50+
-- A single-instance replicaset whose only instance is a learner
-- must fail to bootstrap; once the learners list is emptied the
-- same cluster must start and the instance must become RW.
g.test_all_learners = function()
    local learner_only_config = cbuilder:new()
        :set_global_option('failover.replicasets.r-001.learners',
                           {'i-001'})
        :set_global_option('replication.failover', 'supervised')
        :use_replicaset('r-001')
        :add_instance('i-001', {})
        :config()

    local c = cluster:new(learner_only_config)
    -- The instance is expected to die, so don't wait for it.
    c:start({wait_until_ready = false})

    -- The instance (with the smallest UUID in the
    -- lexicographical order) chooses itself as a join bootstrap
    -- leader and fails since it's unable to bootstrap itself in
    -- RO mode.
    --
    -- If there are other instances within the replicaset the
    -- behavior is unpredictable. They might connect to the
    -- leader before it fails. If so they will wait for it.
    -- Otherwise they will fail.
    local function assert_instance_is_dead()
        t.assert_not(c['i-001'].process:is_alive())
    end
    t.helpers.retrying({timeout = 10}, assert_instance_is_dead)

    local no_learners_config = cbuilder:new(learner_only_config)
        :set_global_option('failover.replicasets.r-001.learners', {})
        :config()
    c:sync(no_learners_config)

    -- Now, the cluster should be able to be successfully
    -- bootstrapped.
    c:start()

    check_instance_mode(c['i-001'], 'rw')
end
86+
87+
-- Check an alert is issued if all instances are marked as
88+
-- learners.
89+
-- Start a healthy single-instance cluster, then reload it with a
-- configuration that enables the supervised failover mode and
-- marks the only instance as a learner. The instance must report
-- the "cannot determine a bootstrap leader" alert, and the alert
-- must disappear after reloading the original configuration.
g.test_alert_on_no_leader = function()
    local config1 = cbuilder:new()
        :use_replicaset('r-001')
        :add_instance('i-001', {})
        :config()

    local cluster = cluster:new(config1)
    cluster:start()

    local config2 = cbuilder:new(config1)
        :set_global_option('failover.replicasets.r-001.learners',
                           {'i-001'})
        :set_global_option('replication.failover', 'supervised')
        :config()
    cluster:reload(config2)

    local msg = 'box_cfg.apply: cannot determine a bootstrap leader based ' ..
                'on the configuration'
    -- find_alert() returns nil when nothing matches, so its
    -- result has to be asserted explicitly: otherwise the test
    -- passes even if the alert is never issued.
    local alert = find_alert(cluster['i-001'], msg)
    t.assert_not_equals(alert, nil,
                        'the bootstrap leader alert is expected')

    -- The alert must go away together with the faulty config.
    cluster:reload(config1)
    cluster['i-001']:exec(function()
        t.assert_equals(box.info.config.alerts, {})
    end)
end

0 commit comments

Comments
 (0)