Skip to content

Commit 9995f38

Browse files
feat: passive round-robin health check (#148)
Co-authored-by: tzssangglass <[email protected]>
1 parent 9563e45 commit 9995f38

File tree

6 files changed

+271
-87
lines changed

6 files changed

+271
-87
lines changed

health_check.md

Lines changed: 23 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -32,12 +32,14 @@ Get the current status of the target.
3232

3333
| name | Type | Requirement | Default | Description |
3434
| ------------ | ------- | ----------- | ------- | ------------------------------------------------------------ |
35-
| shm_name | string | required | | the declarative `lua_shared_dict` is used to store the health status of endpoints. |
35+
| shm_name | string | optional | | the name of the declared `lua_shared_dict` used to store the health status of endpoints. If this option is not set, the health check falls back to [round-robin](#round-robin-based-health-check) check mode. |
3636
| fail_timeout | integer | optional | 10s | sets the time during which the specified number of unsuccessful attempts to communicate with the endpoint should happen to mark the endpoint unavailable, and also sets the period of time the endpoint will be marked unavailable. |
37-
| max_fails | integer | optional | 1 | sets the number of failed attempts that must occur during the `fail_timeout` period for the endpoint to be marked unavailable. |
37+
| max_fails | integer | optional | 1 | sets the number of failed attempts that must occur during the `fail_timeout` period for the endpoint to be marked unavailable. This configuration only takes effect in [policy](#policy-based-health-check) check mode. |
3838
| retry | bool | optional | false | automatically retry another endpoint when operations failed. |
3939

40-
lua example:
40+
## Example
41+
42+
### Policy based health check
4143

4244
```lua
4345
local health_check, err = require("resty.etcd.health_check").init({
@@ -54,19 +56,36 @@ Health check mechanism would switch endpoint only when the previously choosed en
5456

5557
The failure counter and health status of each etcd endpoint are shared across workers and by different etcd clients.
5658

57-
Also note that the `fail_timeout`, `max_fails` and `retry` of the health check cannot be changed once it has been created.
59+
Note: after initialization, the `fail_timeout`, `max_fails` and `retry` settings of the health check are only reset when the health check mode is switched.
60+
61+
### Round-robin based health check
62+
63+
```lua
64+
local health_check, err = require("resty.etcd.health_check").init({
65+
fail_timeout = 10,
66+
retry = false,
67+
})
68+
```
69+
70+
In round-robin health check mode, when an endpoint fails it is marked as unhealthy and will not be connected to within the time set by `fail_timeout` (the next healthy endpoint is selected instead).
71+
72+
Unhealthy endpoints are returned to the selection pool of healthy endpoints once the `fail_timeout` period has elapsed.
73+
74+
The unhealthy status of etcd endpoints is only valid in the current worker.
5875

5976
## Synopsis
6077

6178
```nginx
6279
http {
6380
# required: declares a shared memory zone to store the endpoints' health status
81+
# if you use the round-robin method for health check, you don’t need to set this
6482
lua_shared_dict healthcheck_shm 1m;
6583
6684
server {
6785
location = /healthcheck {
6886
content_by_lua_block {
6987
-- the health check feature is optional, and can be enabled with the following configuration.
88+
-- if you use the round-robin method for health check, you don’t need to set this
7089
local health_check, err = require("resty.etcd.health_check").init({
7190
shm_name = "healthcheck_shm",
7291
fail_timeout = 10,

lib/resty/etcd/health_check.lua

Lines changed: 71 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -1,31 +1,53 @@
11
local ngx_shared = ngx.shared
22
local utils = require("resty.etcd.utils")
3+
local type = type
4+
local now = os.time
35
local conf
46

57
local _M = {}
68

9+
local round_robin_unhealthy_target_hosts
10+
11+
712
local function gen_unhealthy_key(etcd_host)
813
return "unhealthy-" .. etcd_host
914
end
1015

1116
local function get_target_status(etcd_host)
12-
if conf == nil then
13-
return
17+
if not conf then
18+
return nil, "etcd health check uninitialized"
1419
end
1520

16-
local unhealthy_key = gen_unhealthy_key(etcd_host)
17-
local unhealthy_endpoint, err = ngx_shared[conf.shm_name]:get(unhealthy_key)
18-
if err then
19-
utils.log_warn("failed to get unhealthy_key: ",
20-
unhealthy_key, " err: ", err)
21-
return
21+
if type(etcd_host) ~= "string" then
22+
return false, "etcd host invalid"
2223
end
2324

24-
if not unhealthy_endpoint then
25-
return true
26-
end
25+
local unhealthy_key = gen_unhealthy_key(etcd_host)
26+
if conf.shm_name ~= nil then
27+
local unhealthy_endpoint, err = ngx_shared[conf.shm_name]:get(unhealthy_key)
28+
if err then
29+
utils.log_warn("failed to get unhealthy_key: ",
30+
unhealthy_key, " err: ", err)
31+
return
32+
end
2733

28-
return false
34+
if not unhealthy_endpoint then
35+
return true
36+
end
37+
38+
return false
39+
else
40+
if type(round_robin_unhealthy_target_hosts) ~= "table" then
41+
round_robin_unhealthy_target_hosts = {}
42+
end
43+
44+
local target_fail_expired_time = round_robin_unhealthy_target_hosts[unhealthy_key]
45+
if target_fail_expired_time and target_fail_expired_time >= now() then
46+
return false, "endpoint: " .. etcd_host .. " is unhealthy"
47+
else
48+
return true
49+
end
50+
end
2951
end
3052
_M.get_target_status = get_target_status
3153

@@ -44,37 +66,56 @@ end
4466

4567

4668
local function report_failure(etcd_host)
47-
if conf == nil then
48-
return
69+
if not conf then
70+
return nil, "etcd health check uninitialized"
4971
end
5072

51-
local fails, err = fault_count(etcd_host, conf.shm_name, conf.fail_timeout)
52-
if err then
53-
utils.log_error("failed to incr etcd endpoint fail times: ", err)
54-
return
73+
if type(etcd_host) ~= "string" then
74+
return nil, "etcd host invalid"
5575
end
5676

57-
if fails >= conf.max_fails then
58-
local unhealthy_key = gen_unhealthy_key(etcd_host)
59-
local unhealthy_endpoint, _ = ngx_shared[conf.shm_name]:get(unhealthy_key)
60-
if unhealthy_endpoint == nil then
61-
ngx_shared[conf.shm_name]:set(unhealthy_key, etcd_host,
62-
conf.fail_timeout)
63-
utils.log_warn("update endpoint: ", etcd_host, " to unhealthy")
77+
if conf.shm_name ~= nil then
78+
local fails, err = fault_count(etcd_host, conf.shm_name, conf.fail_timeout)
79+
if err then
80+
utils.log_error("failed to incr etcd endpoint fail times: ", err)
81+
return nil, err
6482
end
83+
84+
if fails >= conf.max_fails then
85+
local unhealthy_key = gen_unhealthy_key(etcd_host)
86+
local unhealthy_endpoint, _ = ngx_shared[conf.shm_name]:get(unhealthy_key)
87+
if unhealthy_endpoint == nil then
88+
ngx_shared[conf.shm_name]:set(unhealthy_key, etcd_host,
89+
conf.fail_timeout)
90+
utils.log_warn("update endpoint: ", etcd_host, " to unhealthy")
91+
end
92+
end
93+
else
94+
if type(round_robin_unhealthy_target_hosts) ~= "table" then
95+
round_robin_unhealthy_target_hosts = {}
96+
end
97+
local unhealthy_key = gen_unhealthy_key(etcd_host)
98+
round_robin_unhealthy_target_hosts[unhealthy_key] = now() + conf.fail_timeout
99+
utils.log_warn("update endpoint: ", etcd_host, " to unhealthy")
65100
end
66101
end
67102
_M.report_failure = report_failure
68103

69104

70105
function _M.init(opts)
71-
if conf == nil then
106+
opts = opts or {}
107+
if not conf or opts.shm_name ~= conf.shm_name then
72108
conf = {}
73-
local shared_dict = ngx_shared[opts.shm_name]
74-
if not shared_dict then
75-
return nil, "failed to get ngx.shared dict: " .. opts.shm_name
109+
if opts.shm_name and type(opts.shm_name) == "string" then
110+
local shared_dict = ngx_shared[opts.shm_name]
111+
if not shared_dict then
112+
return nil, "failed to get ngx.shared dict: " .. opts.shm_name
113+
end
114+
conf.shm_name = opts.shm_name
115+
utils.log_info("healthy check use ngx.shared dict: ", opts.shm_name)
116+
else
117+
utils.log_info("healthy check use round robin")
76118
end
77-
conf.shm_name = opts.shm_name
78119
conf.fail_timeout = opts.fail_timeout or 10 -- 10 sec
79120
conf.max_fails = opts.max_fails or 1
80121
conf.retry = opts.retry or false

lib/resty/etcd/v3.lua

Lines changed: 27 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,6 @@ local encode_json = cjson.encode
2323
local encode_base64 = ngx.encode_base64
2424
local decode_base64 = ngx.decode_base64
2525
local semaphore = require("ngx.semaphore")
26-
local INIT_COUNT_RESIZE = 2e8
2726
local health_check = require("resty.etcd.health_check")
2827

2928
local _M = {}
@@ -45,24 +44,14 @@ local refresh_jwt_token
4544

4645
local function choose_endpoint(self)
4746
local endpoints = self.endpoints
48-
local endpoints_len = #endpoints
4947

50-
if health_check.conf ~= nil then
51-
for _, endpoint in ipairs(endpoints) do
52-
if health_check.get_target_status(endpoint.http_host) then
53-
return endpoint
54-
end
48+
for _, endpoint in ipairs(endpoints) do
49+
if health_check.get_target_status(endpoint.http_host) then
50+
return endpoint
5551
end
56-
return nil, "has no healthy etcd endpoint available"
57-
end
58-
59-
self.init_count = (self.init_count or -1) + 1
60-
local pos = self.init_count % endpoints_len + 1
61-
if self.init_count >= INIT_COUNT_RESIZE then
62-
self.init_count = 0
6352
end
6453

65-
return endpoints[pos]
54+
return nil, "has no healthy etcd endpoint available"
6655
end
6756

6857

@@ -92,17 +81,12 @@ local function http_request_uri(self, http_cli, method, uri, body, headers, keep
9281
})
9382

9483
if err then
95-
if health_check.conf ~= nil then
96-
health_check.report_failure(endpoint.http_host)
97-
err = endpoint.http_host .. ": " .. err
98-
end
99-
return nil, err
84+
health_check.report_failure(endpoint.http_host)
85+
return nil, endpoint.http_host .. ": " .. err
10086
end
10187

10288
if res.status >= 500 then
103-
if health_check.conf ~= nil then
104-
health_check.report_failure(endpoint.http_host)
105-
end
89+
health_check.report_failure(endpoint.http_host)
10690
return nil, "invalid response code: " .. res.status
10791
end
10892

@@ -156,12 +140,7 @@ local function _request_uri(self, method, uri, opts, timeout, ignore_auth)
156140
end
157141

158142
local res
159-
if health_check.conf == nil or not health_check.conf.retry then
160-
res, err = http_request_uri(self, http_cli, method, uri, body, headers, keepalive)
161-
if err then
162-
return nil, err
163-
end
164-
else
143+
if health_check.conf.retry then
165144
local max_retry = #self.endpoints * health_check.conf.max_fails + 1
166145
for _ = 1, max_retry do
167146
res, err = http_request_uri(self, http_cli, method, uri, body, headers, keepalive)
@@ -174,6 +153,11 @@ local function _request_uri(self, method, uri, opts, timeout, ignore_auth)
174153
break
175154
end
176155
end
156+
else
157+
res, err = http_request_uri(self, http_cli, method, uri, body, headers, keepalive)
158+
if err then
159+
return nil, err
160+
end
177161
end
178162

179163
if not typeof.string(res.body) then
@@ -269,6 +253,10 @@ function _M.new(opts)
269253
return nil, err
270254
end
271255

256+
if health_check.conf == nil then
257+
health_check.init()
258+
end
259+
272260
return setmetatable({
273261
last_auth_time = now(), -- save last Authentication time
274262
last_refresh_jwt_err = nil,
@@ -596,9 +584,7 @@ local function http_request_chunk(self, http_cli)
596584
ssl_server_name = self.sni,
597585
})
598586
if not ok then
599-
if health_check.conf ~= nil then
600-
health_check.report_failure(endpoint.http_host)
601-
end
587+
health_check.report_failure(endpoint.http_host)
602588
return nil, endpoint.http_host .. ": " .. err
603589
end
604590

@@ -653,12 +639,7 @@ local function request_chunk(self, method, path, opts, timeout)
653639
end
654640

655641
local endpoint
656-
if health_check.conf == nil or not health_check.conf.retry then
657-
endpoint, err = http_request_chunk(self, http_cli)
658-
if err then
659-
return nil, err
660-
end
661-
else
642+
if health_check.conf.retry then
662643
local max_retry = #self.endpoints * health_check.conf.max_fails + 1
663644
for _ = 1, max_retry do
664645
endpoint, err = http_request_chunk(self, http_cli)
@@ -671,6 +652,11 @@ local function request_chunk(self, method, path, opts, timeout)
671652
break
672653
end
673654
end
655+
else
656+
endpoint, err = http_request_chunk(self, http_cli)
657+
if err then
658+
return nil, err
659+
end
674660
end
675661

676662
local res
@@ -710,11 +696,9 @@ local function request_chunk(self, method, path, opts, timeout)
710696
if not body then
711697
return nil, "failed to decode json body: " .. (err or " unkwon")
712698
elseif body.error and body.error.http_code >= 500 then
713-
if health_check.conf ~= nil then
714-
-- health_check retry should do nothing here
715-
-- and let connection closed to create a new one
716-
health_check.report_failure(endpoint.http_host)
717-
end
699+
-- health_check retry should do nothing here
700+
-- and let connection closed to create a new one
701+
health_check.report_failure(endpoint.http_host)
718702
return nil, endpoint.http_host .. ": " .. body.error.http_status
719703
end
720704

t/v3/auth.t

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -200,17 +200,17 @@ GET /t
200200
--- response_body
201201
ok
202202
--- grep_error_log eval
203-
qr/(uri: .+, timeout: \d+|v3 refresh jwt last err: [^,]+|connection refused)/
203+
qr/(uri: .+, timeout: \d+|has no healthy [^,]+)/
204204
--- grep_error_log_out
205205
uri: /kv/put, timeout: 3
206206
uri: /auth/authenticate, timeout: 3
207+
has no healthy etcd endpoint available
207208
uri: /kv/put, timeout: 3
209+
uri: /auth/authenticate, timeout: 3
210+
has no healthy etcd endpoint available
208211
uri: /kv/put, timeout: 3
209-
connection refused
210-
v3 refresh jwt last err: connection refused
211-
connection refused
212-
v3 refresh jwt last err: connection refused
213-
connection refused
212+
uri: /auth/authenticate, timeout: 3
213+
has no healthy etcd endpoint available
214214
215215
216216

0 commit comments

Comments
 (0)