Skip to content

Commit 2d0842e

Browse files
committed
rebalancer: reduce spam in logs
Before this patch "Some buckets are not active ..." logs were printed every `REBALANCER_WORK_INTERVAL` seconds, flooding the log file with redundant messages. Also these records were lacking of information about the current replicaset. This patch fixes the issue by limiting this log frequency and adding information about replicaset's id to log messages. Part of #212 NO_DOC=bugfix
1 parent b507c32 commit 2d0842e

File tree

2 files changed

+58
-1
lines changed

2 files changed

+58
-1
lines changed

test/storage-luatest/storage_1_1_1_test.lua

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,31 @@ local function move_bucket(src_storage, dest_storage, bucket_id)
123123
end, {bucket_id})
124124
end
125125

126+
--
127+
-- The datetime in log should be strictly in format:
128+
-- YEAR-MONTH-DAY HOUR:MIN:SEC.MS
129+
--
130+
local function extract_datetime_from_log(str)
131+
return string.match(str, "(%d+-%d+-%d+) (%d+:%d+:%d+.%d+)")
132+
end
133+
134+
local function test_only_one_record_appears_in_logs(server, record, wait_time)
135+
local first_log_record = nil
136+
t.helpers.retrying({}, function()
137+
first_log_record = server:grep_log(record)
138+
t.assert(first_log_record)
139+
end)
140+
local first_log_time = extract_datetime_from_log(first_log_record)
141+
-- We need to wait a bit in order to catch how much as possible
142+
-- spam in server's logs.
143+
require("fiber").sleep(wait_time)
144+
local last_log_record = server:grep_log(record)
145+
t.assert(last_log_record)
146+
local last_log_time = extract_datetime_from_log(last_log_record)
147+
t.assert_equals(first_log_time, last_log_time,
148+
"There are two identical records in logs")
149+
end
150+
126151
rebalancer_recovery_group.before_all(function(g)
127152
global_cfg = vtest.config_new(cfg_template)
128153
vtest.cluster_new(g, global_cfg)
@@ -213,3 +238,26 @@ rebalancer_recovery_group.test_rebalancer_routes_logging = function(g)
213238
t.assert(g.replica_1_a:grep_log(route_1_to_2))
214239
t.assert(g.replica_1_a:grep_log(route_1_to_3))
215240
end
241+
242+
rebalancer_recovery_group.test_no_log_spam_when_buckets_no_active = function(g)
243+
g.replica_2_a:stop()
244+
g.replica_1_a:exec(function()
245+
rawset(_G, 'old_rebalancer_interval', ivconst.REBALANCER_WORK_INTERVAL)
246+
rawset(_G, 'old_rebalancer_timeout',
247+
ivconst.REBALANCER_GET_STATE_TIMEOUT)
248+
ivconst.REBALANCER_WORK_INTERVAL = 0.01
249+
ivconst.REBALANCER_GET_STATE_TIMEOUT = 0.01
250+
end)
251+
252+
t.helpers.retrying({timeout = 10}, function()
253+
t.assert(g.replica_1_a:grep_log(g.buckets_not_active_log))
254+
end)
255+
test_only_one_record_appears_in_logs(g.replica_1_a,
256+
g.buckets_not_active_pattern,
257+
g.rebalancer_wait_interval * 2)
258+
g.replica_1_a:exec(function()
259+
ivconst.REBALANCER_WORK_INTERVAL = _G.old_rebalancer_interval
260+
ivconst.REBALANCER_GET_STATE_TIMEOUT = _G.old_rebalancer_timeout
261+
end)
262+
g.replica_2_a:start()
263+
end

vshard/storage/init.lua

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2823,6 +2823,7 @@ end
28232823
--
28242824
local function rebalancer_service_f(service)
28252825
local module_version = M.module_version
2826+
local some_buckets_are_not_active = false
28262827
while module_version == M.module_version do
28272828
service:next_iter()
28282829
while not M.is_rebalancer_active do
@@ -2844,7 +2845,13 @@ local function rebalancer_service_f(service)
28442845
'Error during downloading rebalancer states: %s',
28452846
replicasets))
28462847
end
2847-
log.info('Some buckets are not active, retry rebalancing later')
2848+
if not some_buckets_are_not_active then
2849+
log.info('Some buckets are not active, replicaset ' ..
2850+
string.format('%s will retry', M.this_replicaset.id) ..
2851+
string.format(' rebalancing every %s s.',
2852+
consts.REBALANCER_WORK_INTERVAL))
2853+
some_buckets_are_not_active = true
2854+
end
28482855
service:set_activity('idling')
28492856
lfiber.testcancel()
28502857
lfiber.sleep(consts.REBALANCER_WORK_INTERVAL)
@@ -2874,6 +2881,7 @@ local function rebalancer_service_f(service)
28742881
service:set_activity('idling')
28752882
lfiber.testcancel()
28762883
lfiber.sleep(consts.REBALANCER_IDLE_INTERVAL)
2884+
some_buckets_are_not_active = false
28772885
goto continue
28782886
end
28792887
local routes = rebalancer_build_routes(replicasets)
@@ -2901,6 +2909,7 @@ local function rebalancer_service_f(service)
29012909
goto continue
29022910
end
29032911
end
2912+
some_buckets_are_not_active = false
29042913
log.info('Rebalance routes are sent. Schedule next wakeup after '..
29052914
'%f seconds', consts.REBALANCER_WORK_INTERVAL)
29062915
service:set_activity('idling')

0 commit comments

Comments
 (0)