Skip to content

Commit 1fe01f7

Browse files
authored
feat: support healthcheck when connect to etcd cluster nodes (#109)
fix #101 fix #55
1 parent f2abd8b commit 1fe01f7

File tree

6 files changed

+688
-31
lines changed

6 files changed

+688
-31
lines changed

health_check.md

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
1+
# Etcd Cluster Health Check
2+
3+
## Description
4+
5+
Implements a passive health check mechanism: when a connection, read, or write to an endpoint fails, it is recorded as a failure of that endpoint.
6+
7+
## Methods
8+
9+
* [init](#init)
10+
* [report_failure](#report_failure)
11+
* [get_target_status](#get_target_status)
12+
13+
### init
14+
15+
`syntax: health_check, err = health_check.init(params)`
16+
17+
Initializes the health check object, overriding the default params with the given ones. In case of failure, returns `nil` and a string describing the error.
18+
19+
### report_failure
20+
21+
`syntax: health_check.report_failure(etcd_host)`
22+
23+
Reports a health failure which will count against the number of occurrences required to make a target "fail".
24+
25+
### get_target_status
26+
27+
`syntax: healthy, err = health_check.get_target_status(etcd_host)`
28+
29+
Get the current status of the target.
30+
31+
## Config
32+
33+
| name | Type | Requirement | Default | Description |
34+
| ------------ | ------- | ----------- | ------- | ------------------------------------------------------------ |
35+
| shm_name | string | required | | the name of the declared `lua_shared_dict` used to store the health status of endpoints. |
36+
| fail_timeout | integer | optional | 10s | sets the time window during which the specified number of unsuccessful attempts to communicate with the endpoint must happen for the endpoint to be marked unavailable, and also sets the period of time the endpoint will stay marked unavailable. |
37+
| max_fails | integer | optional | 1 | sets the number of failed attempts that must occur during the `fail_timeout` period for the endpoint to be marked unavailable. |
38+
39+
lua example:
40+
41+
```lua
42+
local health_check, err = require("resty.etcd.health_check").init({
43+
shm_name = "healthcheck_shm",
44+
fail_timeout = 10,
45+
max_fails = 1,
46+
})
47+
```
48+
49+
Within a `fail_timeout` window, if there are `max_fails` consecutive failures, the endpoint is marked as unhealthy. An unhealthy endpoint will not be chosen for connections for the next `fail_timeout` seconds.
50+
51+
The health check mechanism switches endpoints only when the previously chosen endpoint is marked as unhealthy.
52+
53+
The failure counter and health status of each etcd endpoint are shared across workers and by different etcd clients.
54+
55+
Also note that the `fail_timeout` and `max_fails` of the health check cannot be changed once it has been created.
56+
57+
## Synopsis
58+
59+
```nginx
60+
http {
61+
# required: declares a shared memory zone to store the endpoints' health status
62+
lua_shared_dict healthcheck_shm 1m;
63+
64+
server {
65+
location = /healthcheck {
66+
content_by_lua_block {
67+
-- the health check feature is optional, and can be enabled with the following configuration.
68+
local health_check, err = require("resty.etcd.health_check").init({
69+
shm_name = "healthcheck_shm",
70+
fail_timeout = 10,
71+
max_fails = 1,
72+
})
73+
74+
local etcd, err = require("resty.etcd").new({
75+
protocol = "v3",
76+
http_host = {
77+
"http://127.0.0.1:12379",
78+
"http://127.0.0.1:22379",
79+
"http://127.0.0.1:32379",
80+
},
81+
user = 'root',
82+
password = 'abc123',
83+
})
84+
}
85+
}
86+
}
87+
}
88+
```

lib/resty/etcd/health_check.lua

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
local ngx_shared = ngx.shared
2+
local utils = require("resty.etcd.utils")
3+
local conf
4+
5+
local _M = {}
6+
7+
-- Build the shared-dict key under which an endpoint's "unhealthy" flag is kept.
-- @param etcd_host endpoint identifier, appended verbatim after the prefix
-- @return the string "unhealthy-" followed by etcd_host
local function gen_unhealthy_key(etcd_host)
    local prefix = "unhealthy-"
    return prefix .. etcd_host
end
10+
11+
-- Tell whether an endpoint is currently considered healthy.
-- Returns nothing when the health check has not been initialised (conf is nil)
-- or when the shared-dict lookup reports an error. Otherwise returns true when
-- no "unhealthy" flag is stored for the endpoint, and false when one is.
-- @param etcd_host endpoint identifier used to derive the flag key
local function get_target_status(etcd_host)
    if conf == nil then
        return
    end

    local flag_key = gen_unhealthy_key(etcd_host)
    local dict = ngx_shared[conf.shm_name]
    local flagged, err = dict:get(flag_key)
    if err then
        utils.log_warn("failed to get unhealthy_key: ",
                       flag_key, " err: ", err)
        return
    end

    -- a missing flag means the endpoint is healthy
    return not flagged
end
30+
_M.get_target_status = get_target_status
31+
32+
33+
-- Increment the failure counter stored under `key` in the shared dict `shm_name`.
-- The incr call passes 0 as the initial value and fail_timeout as its init_ttl
-- argument, and a warning is logged when incr reports a forcible eviction.
-- @return the new counter value and nil, or nil plus the error reported by incr
local function fault_count(key, shm_name, fail_timeout)
    local dict = ngx_shared[shm_name]
    local count, err, evicted = dict:incr(key, 1, 0, fail_timeout)
    if err then
        return nil, err
    end

    if evicted then
        utils.log_warn("shared dict: ", shm_name, " is full, valid items forcibly overwritten")
    end

    return count, nil
end
44+
45+
46+
-- Record one failed interaction with etcd_host.
-- Does nothing when the health check has not been initialised (conf is nil).
-- Otherwise the endpoint's failure counter is incremented; once it reaches
-- conf.max_fails and no "unhealthy" flag is stored yet, the flag is written
-- with a TTL of conf.fail_timeout.
-- @param etcd_host endpoint identifier, used both as the counter key and to
--                  derive the "unhealthy" flag key
local function report_failure(etcd_host)
    if conf == nil then
        return
    end

    local fails, err = fault_count(etcd_host, conf.shm_name, conf.fail_timeout)
    if err then
        utils.log_error("failed to incr etcd endpoint fail times: ", err)
        return
    end

    if fails < conf.max_fails then
        return
    end

    local dict = ngx_shared[conf.shm_name]
    local unhealthy_key = gen_unhealthy_key(etcd_host)
    local unhealthy_endpoint, get_err = dict:get(unhealthy_key)
    if get_err then
        -- the lookup failed, so the flag state is unknown; still try to set it
        utils.log_warn("failed to get unhealthy_key: ",
                       unhealthy_key, " err: ", get_err)
    end

    if unhealthy_endpoint ~= nil then
        -- already flagged; leave the existing flag and its TTL untouched
        return
    end

    local ok, set_err, forcible = dict:set(unhealthy_key, etcd_host,
                                           conf.fail_timeout)
    if not ok then
        -- do not claim the endpoint was marked when the write did not happen
        utils.log_error("failed to mark endpoint: ", etcd_host,
                        " as unhealthy: ", set_err)
        return
    end

    if forcible then
        utils.log_warn("shared dict: ", conf.shm_name,
                       " is full, valid items forcibly overwritten")
    end

    utils.log_warn("update endpoint: ", etcd_host, " to unhealthy")
end
67+
_M.report_failure = report_failure
68+
69+
70+
-- Initialise the module-wide health check configuration.
-- The configuration is created once; later calls leave it unchanged and
-- return the module again.
-- @param opts table with:
--        shm_name     (required) name of a declared lua_shared_dict
--        fail_timeout (optional) defaults to 10
--        max_fails    (optional) defaults to 1
-- @return the module table and nil on success, or nil plus an error string
function _M.init(opts)
    if conf ~= nil then
        -- already initialised: settings are fixed, hand back the module
        return _M, nil
    end

    if type(opts) ~= "table" then
        return nil, "opts must be a table"
    end

    local shm_name = opts.shm_name
    if shm_name == nil then
        return nil, "opts.shm_name is required"
    end

    -- validate before touching conf, so a failed init leaves the module
    -- uninitialised and can simply be retried
    local shared_dict = ngx_shared[shm_name]
    if not shared_dict then
        return nil, "failed to get ngx.shared dict: " .. tostring(shm_name)
    end

    conf = {
        shm_name = shm_name,
        fail_timeout = opts.fail_timeout or 10, -- 10 sec
        max_fails = opts.max_fails or 1,
    }
    _M.conf = conf
    return _M, nil
end
84+
85+
return _M

lib/resty/etcd/utils.lua

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,7 @@ end
8484
local ngx_log = ngx.log
8585
local ngx_ERR = ngx.ERR
8686
local ngx_INFO = ngx.INFO
87+
local ngx_WARN = ngx.WARN
8788
-- Forward all arguments to ngx_log at the ngx_ERR level.
local function log_error(...)
    local level = ngx_ERR
    return ngx_log(level, ...)
end
@@ -95,6 +96,13 @@ local function log_info( ... )
9596
end
9697
_M.log_info = log_info
9798

99+
100+
-- Forward all arguments to ngx_log at the ngx_WARN level.
local function log_warn(...)
    local level = ngx_WARN
    return ngx_log(level, ...)
end
103+
_M.log_warn = log_warn
104+
105+
98106
local function verify_key(key)
99107
if not key or #key == 0 then
100108
return false, "key should not be empty"

0 commit comments

Comments
 (0)