api7
diff --git a/‎health_check.md‎
Lines changed: 88 additions & 0 deletions b/‎health_check.md‎
Lines changed: 88 additions & 0 deletions
diff --git a/‎lib/resty/etcd/health_check.lua‎
Lines changed: 85 additions & 0 deletions b/‎lib/resty/etcd/health_check.lua‎
Lines changed: 85 additions & 0 deletions
diff --git a/‎lib/resty/etcd/utils.lua‎
Lines changed: 8 additions & 0 deletions b/‎lib/resty/etcd/utils.lua‎
Lines changed: 8 additions & 0 deletions
@@ -0,0 +1,88 @@
+# Etcd Cluster Health Check
+
+## Description
+
+Implements a passive health check mechanism: when a connection, read, or write fails, it is recorded as a failure of the endpoint.
+
+## Methods
+
+* [init](#init)
+* [report_failure](#report_failure)
+* [get_target_status](#get_target_status)
+
+###  init
+
+`syntax: health_check, err = health_check.init(params)`
+
+Initializes the health check object, overriding default params with the given ones. In case of failures, returns `nil` and a string describing the error.
+
+###  report_failure
+
+`syntax: health_check.report_failure(etcd_host)`
+
+Reports a health failure which will count against the number of occurrences required to make a target "fail". 
+
+###  get_target_status
+
+`syntax: healthy, err = health_check.get_target_status(etcd_host)`
+
+Get the current status of the target.
+
+## Config
+
+| name         | Type    | Requirement | Default | Description                                                  |
+| ------------ | ------- | ----------- | ------- | ------------------------------------------------------------ |
+| shm_name     | string  | required    |         | the name of the declared `lua_shared_dict` used to store the health status of endpoints. |
+| fail_timeout | integer | optional    | 10s     | sets the time during which the specified number of unsuccessful attempts to communicate with the endpoint should happen to mark the endpoint unavailable, and also sets the period of time the endpoint will be marked unavailable. |
+| max_fails    | integer | optional    | 1       | sets the number of failed attempts that must occur during the `fail_timeout` period for the endpoint to be marked unavailable. |
+
+lua example:
+
+```lua
+local health_check, err = require("resty.etcd.health_check").init({
+    shm_name = "healthcheck_shm",
+    fail_timeout = 10,
+    max_fails = 1,
+})
+```
+
+Within a `fail_timeout` period, if there are `max_fails` consecutive failures, the endpoint is marked as unhealthy. An unhealthy endpoint will not be chosen for connections for the next `fail_timeout` seconds.
+
+The health check mechanism switches endpoints only when the previously chosen endpoint is marked as unhealthy.
+
+The failure counter and health status of each etcd endpoint are shared across workers and by different etcd clients.
+
+Also note that the `fail_timeout` and `max_fails` of the health check cannot be changed once it has been created.
+
+##  Synopsis
+
+```nginx
+http {
+    # required: declares a shared memory zone to store the endpoints' health status
+    lua_shared_dict healthcheck_shm 1m;
+
+    server {
+        location = /healthcheck {
+            content_by_lua_block {
+                -- the health check feature is optional, and can be enabled with the following configuration.
+                local health_check, err = require("resty.etcd.health_check").init({
+                    shm_name = "healthcheck_shm",
+                    fail_timeout = 10,
+                    max_fails = 1,
+                })
+
+                local etcd, err = require("resty.etcd").new({
+                    protocol = "v3",
+                    http_host = {
+                        "http://127.0.0.1:12379", 
+                        "http://127.0.0.1:22379",
+                        "http://127.0.0.1:32379",
+                    },
+                    user = 'root',
+                    password = 'abc123',
+                })
+            }
+        }
+    }
+}
+```
@@ -0,0 +1,85 @@
+local ngx_shared    = ngx.shared
+local utils         = require("resty.etcd.utils")
+-- module-level health check settings (shm_name, fail_timeout, max_fails);
+-- stays nil until init() assigns it, and the other functions in this module
+-- return early while it is nil
+local conf
+
+local _M = {}
+
+-- Builds the shared-dict key under which an endpoint's unhealthy marker is
+-- stored, by prefixing the endpoint with "unhealthy-".
+-- @param etcd_host endpoint identifier to build the key for
+-- @return the key string
+local function gen_unhealthy_key(etcd_host)
+    local key = "unhealthy-" .. etcd_host
+    return key
+end
+
+-- Reports whether an etcd endpoint is currently considered healthy.
+-- @param etcd_host endpoint identifier, as passed to report_failure
+-- @return true when no unhealthy marker is stored for the endpoint,
+--         false when one is stored, and no value when the module has not
+--         been initialized or the shared-dict lookup reports an error
+local function get_target_status(etcd_host)
+    if conf == nil then
+        return
+    end
+
+    local key = gen_unhealthy_key(etcd_host)
+    local dict = ngx_shared[conf.shm_name]
+    local marker, err = dict:get(key)
+    if err then
+        utils.log_warn("failed to get unhealthy_key: ",
+                key, " err: ", err)
+        return
+    end
+
+    -- the endpoint is healthy exactly when no marker is present
+    return not marker
+end
+_M.get_target_status = get_target_status
+
+
+-- Increments the failure counter stored under `key` in the shared dict
+-- named `shm_name`.
+-- The counter is incremented by 1 with an initial value of 0; fail_timeout
+-- is passed as the fourth argument of incr (init_ttl in the
+-- lua-nginx-module API).
+-- @param key          shared-dict key of the counter
+-- @param shm_name     name of the lua_shared_dict to use
+-- @param fail_timeout value forwarded to incr as its fourth argument
+-- @return the new counter value and nil, or nil and the error from incr
+local function fault_count(key, shm_name, fail_timeout)
+    local dict = ngx_shared[shm_name]
+    local count, err, forcible = dict:incr(key, 1, 0, fail_timeout)
+
+    if not err then
+        if forcible then
+            utils.log_warn("shared dict: ", shm_name, " is full, valid items forcibly overwritten")
+        end
+        return count, nil
+    end
+
+    return nil, err
+end
+
+
+-- Records one failure for an etcd endpoint and, once the failure counter
+-- reaches conf.max_fails, stores an unhealthy marker for it in the shared
+-- dict with an expiry of conf.fail_timeout.
+-- Does nothing when the module has not been initialized via init().
+-- @param etcd_host endpoint identifier whose failure is being reported
+local function report_failure(etcd_host)
+    if conf == nil then
+        return
+    end
+
+    local fails, err = fault_count(etcd_host, conf.shm_name, conf.fail_timeout)
+    if err then
+        utils.log_error("failed to incr etcd endpoint fail times: ", err)
+        return
+    end
+
+    if fails < conf.max_fails then
+        return
+    end
+
+    local dict = ngx_shared[conf.shm_name]
+    local unhealthy_key = gen_unhealthy_key(etcd_host)
+
+    -- an existing marker is left alone so its expiry is not extended
+    if dict:get(unhealthy_key) ~= nil then
+        return
+    end
+
+    -- check the result of set: reporting the endpoint as unhealthy when the
+    -- marker was not actually stored would be misleading
+    local ok, set_err, forcible = dict:set(unhealthy_key, etcd_host,
+            conf.fail_timeout)
+    if not ok then
+        utils.log_error("failed to mark endpoint: ", etcd_host,
+                " as unhealthy: ", set_err)
+        return
+    end
+
+    if forcible then
+        utils.log_warn("shared dict: ", conf.shm_name,
+                " is full, valid items forcibly overwritten")
+    end
+    utils.log_warn("update endpoint: ", etcd_host, " to unhealthy")
+end
+_M.report_failure = report_failure
+
+
+-- Initializes the module-wide health check configuration.
+-- The configuration is created only once: when the module is already
+-- initialized the given opts are ignored and the module is returned as is.
+-- Nothing is stored when validation fails, so a failed init leaves the
+-- module uninitialized rather than half-configured.
+-- @param opts table with:
+--             shm_name     (required) name of a declared lua_shared_dict
+--             fail_timeout (optional) defaults to 10
+--             max_fails    (optional) defaults to 1
+-- @return the module table and nil on success, or nil and an error string
+function _M.init(opts)
+    if conf ~= nil then
+        return _M, nil
+    end
+
+    if type(opts) ~= "table" then
+        return nil, "opts must be a table"
+    end
+
+    local shm_name = opts.shm_name
+    if type(shm_name) ~= "string" then
+        return nil, "shm_name must be a string"
+    end
+
+    if not ngx_shared[shm_name] then
+        return nil, "failed to get ngx.shared dict: " .. shm_name
+    end
+
+    -- assign conf only after every check has passed
+    conf = {
+        shm_name = shm_name,
+        fail_timeout = opts.fail_timeout or 10,    -- 10 sec
+        max_fails = opts.max_fails or 1,
+    }
+    _M.conf = conf
+    return _M, nil
+end
+
+return _M
@@ -84,6 +84,7 @@ end
 local ngx_log = ngx.log
 local ngx_ERR = ngx.ERR
 local ngx_INFO = ngx.INFO
+local ngx_WARN = ngx.WARN
 local function log_error(...)
     return ngx_log(ngx_ERR, ...)
 end
@@ -95,6 +96,13 @@ local function log_info( ... )
 end
 _M.log_info = log_info
 
+
+-- Writes the given arguments to the nginx log at WARN level.
+local log_warn = function(...)
+    return ngx_log(ngx_WARN, ...)
+end
+_M.log_warn = log_warn
+
+
 local function verify_key(key)
     if not key or #key == 0 then
         return false, "key should not be empty"