connpool: reconnect to recent instances on error

georgiy-belyanin · Totktonada · commit fa0540b25544 · 2025-07-30T20:05:30.000+03:00
This patch introduces logic that makes Tarantool connpool automatically reconnect to recently accessed instances in case of errors. The main aim is to make the connection pool methods `connpool.call()` and `connpool.filter()` faster by making them behave as follows when they need a connection to multiple instances. * Run some of the mentioned connpool methods. * Apply static configuration filters and find instance candidates. * Check if there are recent connections to all of the candidates. - Yes, there are connections to all of them (both active and broken). In that case, try to use the active and available ones. If a connection is not available, we may guarantee that it has also been unavailable during the past 3 seconds (the hard-coded reconnect interval). That means there is no need to wait and try to reconnect to it. - No, some instances have not been accessed yet. In that case, connect to all of the remaining ones in parallel and wait until each connection is established or has failed. The key improvement is that we no longer wait if some of the candidate connections have failed and re-use the existing connections. This logic will be added in the follow-up patch. Part of tarantool#10330 NO_DOC=will be added later
diff --git a/changelogs/unreleased/connpool-reconnect-to-recent.md b/changelogs/unreleased/connpool-reconnect-to-recent.md
@@ -0,0 +1,5 @@
+## feature/connpool
+
+* The `experimental.connpool` methods now try to reconnect to recently accessed
+  instances when they become unavailable. Reconnect attempts happen after a
+  constant interval and are stopped if the instance is no longer needed.
diff --git a/src/box/lua/connpool.lua b/src/box/lua/connpool.lua
@@ -8,6 +8,10 @@ local netbox = require('net.box')
 local WATCHER_DELAY = 0.1
 local WATCHER_TIMEOUT = 10
 
+-- This option controls the delay between reconnect attempts to
+-- recently needed instances whose connections have failed.
+local RECONNECT_AFTER = 3
+
 -- {{{ Basic instance connection pool
 
 local pool_methods = {}
@@ -75,6 +79,86 @@ function pool_methods._unused_connection_watchdog_wake(self)
     self._unused_connection_watchdog_fiber = f
 end
 
+-- Run a single pass of the failed connection watchdog.
+--
+-- Every cached connection that carries a `_reconnect` timestamp
+-- is inspected. Those whose timestamp has been reached are
+-- reconnected (or have the timestamp cleared if they are valid
+-- again).
+--
+-- Returns the number of seconds until the nearest scheduled
+-- reconnect, or `math.huge` when nothing is scheduled.
+function pool_methods._failed_connection_watchdog_step(self)
+    local now = clock.monotonic()
+    local next_wait = math.huge
+    local due_names = {}
+
+    -- Pass one: only gather the names that are due. The actual
+    -- reconnect is postponed to the second pass so that
+    -- self._connections is not modified while it is traversed.
+    for name, conn in pairs(self._connections) do
+        local reconnect_at = conn._reconnect
+        if reconnect_at ~= nil then
+            local remaining = reconnect_at - now
+
+            if remaining <= 0 then
+                table.insert(due_names, name)
+            elseif remaining < next_wait then
+                next_wait = remaining
+            end
+        end
+    end
+
+    -- Pass two: handle everything that is due.
+    for _, name in ipairs(due_names) do
+        local stale_conn = self._connections[name]
+
+        -- The entry was present when it was collected and the
+        -- function has not yielded since then, so it must still
+        -- be present: a connection that was not closed as unused
+        -- stays so for the whole step.
+        --
+        -- Keeping the step non-yielding avoids races between
+        -- a reconnect and the unused connection deadline being
+        -- reached at the same moment.
+        assert(stale_conn ~= nil)
+
+        local opts = {
+            ttl = 0,
+            wait_connected = false,
+            fetch_schema = stale_conn.opts.fetch_schema,
+        }
+
+        if is_connection_valid(stale_conn, opts) then
+            -- The connection is usable again: stop tracking it.
+            stale_conn._reconnect = nil
+        else
+            -- Replace the connection and schedule the next check
+            -- one reconnect interval from the start of this step.
+            local fresh_conn = self:connect(name, opts)
+            fresh_conn._reconnect = now + RECONNECT_AFTER
+            if next_wait > RECONNECT_AFTER then
+                next_wait = RECONNECT_AFTER
+            end
+        end
+    end
+
+    return next_wait
+end
+
+-- Body of the failed connection watchdog fiber.
+--
+-- Repeats watchdog steps until a step reports that no reconnect
+-- is scheduled (`math.huge`). Between steps it waits on the
+-- watchdog condition variable for the reported interval, unless
+-- a wake-up has already been requested through the `fired` flag.
+function pool_methods._failed_connection_watchdog_loop(self)
+    local watchdog = self._failed_connection_watchdog
+    local delay = self:_failed_connection_watchdog_step()
+
+    while delay ~= math.huge do
+        -- Skip the wait if somebody asked for a wake-up while
+        -- the step was running.
+        if not watchdog.fired then
+            watchdog.cond:wait(delay)
+        end
+        watchdog.fired = false
+
+        delay = self:_failed_connection_watchdog_step()
+    end
+end
+
+-- Make sure the failed connection watchdog reacts to a change.
+--
+-- If the watchdog fiber exists and is not dead, mark the wake-up
+-- as fired and signal its condition variable. Otherwise create
+-- a new watchdog fiber and remember it in the pool.
+function pool_methods._failed_connection_watchdog_wake(self)
+    local current = self._failed_connection_watchdog_fiber
+    local is_running = current ~= nil and current:status() ~= 'dead'
+
+    if not is_running then
+        local loop = self._failed_connection_watchdog_loop
+        self._failed_connection_watchdog_fiber = fiber.new(loop, self)
+        return
+    end
+
+    local watchdog = self._failed_connection_watchdog
+    watchdog.fired = true
+    watchdog.cond:signal()
+end
+
 --- Connect to an instance or receive a cached connection by
 --- name.
 ---
@@ -91,6 +175,7 @@ function pool_methods.connect(self, instance_name, opts)
 
     local conn = self._connections[instance_name]
     local old_deadline = (conn or {})._deadline
+    local old_reconnect = (conn or {})._reconnect
     if not is_connection_valid(conn, opts) then
         local uri = config:instance_uri('peer', {instance = instance_name})
         if uri == nil then
@@ -118,11 +203,17 @@ function pool_methods.connect(self, instance_name, opts)
         end
         conn.mode = mode
         conn._deadline = old_deadline
+        conn._reconnect = old_reconnect
         local function watch_status(_key, value)
             conn._mode = value.is_ro and 'ro' or 'rw'
             self._connection_mode_update_cond:broadcast()
         end
         conn:watch('box.status', watch_status)
+        local function on_disconnect()
+            conn._reconnect = clock.monotonic() + RECONNECT_AFTER
+            self:_failed_connection_watchdog_wake()
+        end
+        conn:on_disconnect(on_disconnect)
     end
 
     local idle_timeout = opts.ttl or self._idle_timeout
@@ -227,6 +318,13 @@ local function create_pool()
             cond = fiber.cond(),
         },
 
+        -- Failed connection management
+        _failed_connection_watchdog_fiber = nil,
+        _failed_connection_watchdog = {
+            fired = false,
+            cond = fiber.cond(),
+        },
+
         _idle_timeout = 60,
     }, pool_mt)
 end
diff --git a/test/config-luatest/rpc_test.lua b/test/config-luatest/rpc_test.lua
@@ -1,6 +1,7 @@
 -- tags: parallel
 
 local t = require('luatest')
+local fiber = require('fiber')
 local fun = require('fun')
 local treegen = require('luatest.treegen')
 local server = require('luatest.server')
@@ -1222,3 +1223,85 @@ g.test_closes_unused_connections = function()
         t.assert_equals(_G.connects, 4)
     end)
 end
+
+-- Verify that connpool reconnects in the background to a recently
+-- used instance after the instance is stopped and started again,
+-- and that a connection object saved before the failure stays
+-- broken.
+g.test_tries_to_reconnect = function()
+    local config = cbuilder:new()
+        :set_global_option('credentials.users.myuser',
+            {password = 'secret',
+             roles = { 'replication' },
+             privileges = {{permissions = {'execute'}, universe = true}}})
+        :set_global_option('iproto.advertise.peer.login', 'myuser')
+        :add_instance('i-001', { database = { mode = 'rw' } })
+        :add_instance('i-002', {})
+        :config()
+
+    local cluster = cluster:new(config)
+
+    -- The reconnect after interval for connpool is hardcoded in
+    -- connpool and equals to 3 seconds.
+    local reconnect_after = 3
+
+    -- Add a counter to count netbox.connect() calls.
+    treegen.write_file(cluster._dir, 'override/net/box.lua',
+        string.dump(function()
+            local loaders = require('internal.loaders')
+
+            rawset(_G, 'connects', 0)
+
+            local builtin_netbox = loaders.builtin['net.box']
+            local builtin_connect = builtin_netbox.connect
+            builtin_netbox.connect = function(...)
+                _G.connects = _G.connects + 1
+                return builtin_connect(...)
+            end
+
+            return builtin_netbox
+        end))
+
+    cluster:start()
+
+    cluster['i-001']:exec(function()
+        local connpool = require('experimental.connpool')
+
+        connpool.call('box.info', nil, {mode = 'ro'})
+        t.assert_equals(_G.connects, 2)
+
+        -- Save a connection to i-002.
+        rawset(_G, 'i2_conn', connpool.connect('i-002'))
+    end)
+
+    cluster['i-002']:stop()
+
+    -- The connpool should try to reconnect.
+    -- Let's skip the first attempt to make sure it tries
+    -- more than once.
+    fiber.sleep(reconnect_after + 1)
+
+    cluster['i-001']:exec(function()
+        local reconnect_after = 3
+        -- Check the saved connection is broken since i-002
+        -- has been stopped.
+        --
+        -- The state is read inside the callback so that every
+        -- retry observes the current value. Passing
+        -- `_G.i2_conn.state` as an argument would evaluate it
+        -- only once, before the first attempt, which makes the
+        -- retries meaningless.
+        t.helpers.retrying({timeout = reconnect_after + 1}, function()
+            t.assert_equals(_G.i2_conn.state, 'error')
+        end)
+    end)
+
+    cluster['i-002']:start()
+    fiber.sleep(reconnect_after + 1)
+
+    cluster['i-001']:exec(function()
+        local connpool = require('experimental.connpool')
+        local reconnect_after = 3
+
+        -- Check reconnecting has been done in the background
+        -- and no new connections have been created.
+        local cur_connects = _G.connects
+        t.helpers.retrying({timeout = reconnect_after + 1}, connpool.call,
+                           'box.info', nil, {mode = 'ro'})
+        t.assert_equals(_G.connects, cur_connects)
+
+        -- A new connection to i-002 is active but the saved
+        -- one should still remain broken.
+        t.assert_equals(connpool.connect('i-002').state, 'active')
+        t.assert_equals(_G.i2_conn.state, 'error')
+    end)
+end