Skip to content

Commit cc059af

Browse files
committed
safe/fast mode: add tnt_crud_router_cache_clear_ts metric to router
This metric helps users monitor the status of routers in the cluster when switching it back to fast mode. Also changed the metrics callbacks to prevent duplicate metrics if the init function is called multiple times.
1 parent c431b68 commit cc059af

File tree

4 files changed

+103
-12
lines changed

4 files changed

+103
-12
lines changed

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,13 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
77

88
## [Unreleased]
99

10+
### Added
11+
* Add `tnt_crud_router_cache_clear_ts` metric to router to help users correctly disable safe mode in the cluster.
12+
1013
### Fixed
1114
* Return bucket_ref error as array in `crud.*_many` methods.
1215
* Move bucket_unref out of transaction.
16+
* Prevent duplicate metrics if init function is called multiple times.
1317

1418
## [1.7.2] - 28-01-26
1519

crud.lua

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -167,6 +167,8 @@ crud.rebalance.router_cache_clear = rebalance.router_api.cache_clear
167167

168168
--- Initialize the router role.
-- Publishes the `crud` module table as the global `crud` and then sets up
-- the router-side rebalance metrics.
function crud.init_router()
    -- rawset bypasses any metamethods installed on the global table.
    local globals = _G
    rawset(globals, 'crud', crud)

    -- Router-only metrics (see rebalance.init_router_metrics).
    rebalance.init_router_metrics()
end
171173

172174
function crud.stop_router()

crud/common/rebalance.lua

Lines changed: 39 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
local log = require('log')
2+
local fiber = require('fiber')
23
local trigger = require('internal.trigger')
34
local vshard_consts = require('vshard.consts')
45
local schema = require('crud.schema')
@@ -11,6 +12,12 @@ local rebalance = {
1112
safe_mode = false,
1213
-- Trigger is run with one argument: true if safe mode is enabled and false if disabled.
1314
on_safe_mode_toggle = trigger.new('_crud.safe_mode_toggle'),
15+
router_cache_clear_ts = 0,
16+
17+
_metrics = {
18+
safe_mode_enabled_gauge = nil,
19+
router_cache_clear_ts_gauge = nil,
20+
},
1421
}
1522

1623
local function safe_mode_bucket_trigger(_, new, space, op)
@@ -98,17 +105,7 @@ function rebalance.init()
98105
_safe_mode_disable()
99106
end
100107

101-
--- Rebalance related metrics
102-
if has_metrics_module then
103-
local safe_mode_enabled_gauge = metrics.gauge(
104-
'tnt_crud_storage_safe_mode_enabled',
105-
"is safe mode enabled on this storage instance"
106-
)
107-
108-
metrics.register_callback(function()
109-
safe_mode_enabled_gauge:set(rebalance.safe_mode_status() and 1 or 0)
110-
end)
111-
end
108+
rebalance.init_storage_metrics()
112109
end
113110

114111
function rebalance.safe_mode_status()
@@ -139,7 +136,37 @@ function rebalance.router_api.cache_clear()
139136
log.warn("Router is not initialized yet")
140137
return
141138
end
142-
return router:_route_map_clear()
139+
router:_route_map_clear()
140+
rebalance.router_cache_clear_ts = fiber.time()
141+
end
142+
143+
--- Rebalance related metrics
144+
--- Metrics callback for storages.
-- Publishes the current safe mode status as 1 (enabled) or 0 (disabled).
-- Does nothing while the gauge has not been created yet.
function rebalance._metrics.storage_callback()
    local gauge = rebalance._metrics.safe_mode_enabled_gauge
    if gauge then
        local value = 0
        if rebalance.safe_mode_status() then
            value = 1
        end
        gauge:set(value)
    end
end
148+
149+
--- Create the storage safe mode gauge and register its update callback.
-- No-op when the metrics module is not available.
function rebalance.init_storage_metrics()
    if has_metrics_module then
        local gauge = metrics.gauge(
            'tnt_crud_storage_safe_mode_enabled',
            "is safe mode enabled on this storage instance"
        )
        rebalance._metrics.safe_mode_enabled_gauge = gauge

        -- The callback is a named module-level function rather than a fresh
        -- closure, so repeated init calls pass the same function value.
        metrics.register_callback(rebalance._metrics.storage_callback)
    end
end
157+
158+
--- Metrics callback for routers.
-- Publishes rebalance.router_cache_clear_ts through the router gauge.
-- Does nothing while the gauge has not been created yet.
function rebalance._metrics.router_callback()
    local gauge = rebalance._metrics.router_cache_clear_ts_gauge
    if gauge then
        gauge:set(rebalance.router_cache_clear_ts)
    end
end
162+
163+
--- Create the router cache-clear timestamp gauge and register its callback.
-- No-op when the metrics module is not available.
function rebalance.init_router_metrics()
    if has_metrics_module then
        local gauge = metrics.gauge(
            'tnt_crud_router_cache_clear_ts',
            "when route cache was last cleared on this router instance"
        )
        rebalance._metrics.router_cache_clear_ts_gauge = gauge

        -- The callback is a named module-level function rather than a fresh
        -- closure, so repeated init calls pass the same function value.
        metrics.register_callback(rebalance._metrics.router_callback)
    end
end
144171

145172
return rebalance

test/integration/metrics_test.lua

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
local fiber = require('fiber')
12
local helpers = require('test.helper')
23
local t = require('luatest')
34

@@ -83,3 +84,60 @@ pgroup.test_safe_mode_storage_metrics = function(g)
8384
end
8485
end)
8586
end
87+
88+
--- Check the `tnt_crud_router_cache_clear_ts` metric.
-- The metric must exist on the router only, start at 0, and move to
-- (approximately) the current time after the router cache is cleared.
pgroup.test_router_cache_metrics = function(g)
    -- `require` raises when the module is missing, which would abort the test
    -- before the skip check; pcall turns that into a boolean.
    local has_metrics_module = pcall(require, 'metrics')
    t.skip_if(not has_metrics_module, 'No metrics module in current version')

    local metric_name = 'tnt_crud_router_cache_clear_ts'
    local collect_cmd = "return require('metrics').collect({ invoke_callbacks = true })"

    -- Returns the first observation of the metric on `server`, or nil.
    local function find_metric(server)
        local observed = server:eval(collect_cmd)
        for _, m in pairs(observed) do
            if m.metric_name == metric_name then
                return m
            end
        end
        return nil
    end

    -- Returns the router's observation of the metric, failing if it is absent.
    local function get_router_metric()
        local m = find_metric(g.router)
        if m == nil then
            t.fail('No tnt_crud_router_cache_clear_ts metric found on router')
        end
        return m
    end

    -- Fails the test if any storage exposes the router-only metric.
    local function assert_no_metric_on_storages()
        helpers.call_on_storages(g.cluster, function(server)
            if find_metric(server) ~= nil then
                t.fail('tnt_crud_router_cache_clear_ts metric found on storage')
            end
        end)
    end

    -- Check router cache metric initial value on router
    t.assert_equals(get_router_metric().value, 0, 'Cache never cleared')

    -- Check no router cache metric on storage
    assert_no_metric_on_storages()

    -- Clear router cache
    local expected_ts = fiber.time()
    g.router:call("crud.rebalance.router_cache_clear")

    -- Check router cache metric new value on router.
    -- The timestamp is taken in the test process, so allow a 5 second margin.
    t.assert_almost_equals(
        get_router_metric().value,
        expected_ts,
        5,
        'Cache clear timestamp should be updated'
    )

    -- Check no router cache metric appeared on storage
    assert_no_metric_on_storages()
end

0 commit comments

Comments
 (0)