Failsafe mode wasn't always triggered in case of Etcd unavailability (patroni#3404)

CyberDem0n · web-flow · commit 29532763284b · 2025-07-11T18:01:31.000+02:00
During each heartbeat cycle Patroni makes two requests to Etcd: (1) get_cluster() and (2) update_lock(). If a request to one Etcd node fails, Patroni switches to another node and retries. At the same time it sets a flag indicating that the Etcd topology must be rediscovered. Rediscovery happens either after the current request completes successfully or before the next request is executed. In the second case, the etcd.EtcdException raised by the topology discovery functions wasn't handled, and as a result failsafe_mode wasn't triggered. Close patroni#3403
diff --git a/patroni/dcs/etcd.py b/patroni/dcs/etcd.py
@@ -317,7 +317,12 @@ def api_execute(self, path: str, method: str, params: Optional[Dict[str, Any]] =
 
         # Update machines_cache if previous attempt of update has failed
         if self._update_machines_cache:
-            self._load_machines_cache()
+            try:
+                self._load_machines_cache()
+            except etcd.EtcdException as e:
+                # If etcd cluster isn't accessible _load_machines_cache() -> _refresh_machines_cache() may raise
+                # etcd.EtcdException. We need to convert it to etcd.EtcdConnectionFailed for failsafe_mode to work.
+                raise etcd.EtcdConnectionFailed('No more machines in the cluster') from e
         elif not self._use_proxies and time.time() - self._machines_cache_updated > self._machines_cache_ttl:
             self._refresh_machines_cache()
 
diff --git a/tests/test_etcd.py b/tests/test_etcd.py
@@ -211,6 +211,9 @@ def test_api_execute(self):
                 patch.object(EtcdClient, '_load_machines_cache', Mock(return_value=True)):
             self.assertRaises(etcd.EtcdException, rtry, self.client.api_execute, '/', 'GET', params={'retry': rtry})
 
+        with patch.object(EtcdClient, '_get_machines_list', Mock(side_effect=etcd.EtcdConnectionFailed)):
+            self.assertRaises(etcd.EtcdConnectionFailed, self.client.api_execute, '/', 'GET')
+
         with patch.object(EtcdClient, '_do_http_request', Mock(side_effect=etcd.EtcdException)):
             self.client._read_timeout = 0.01
             self.assertRaises(etcd.EtcdException, self.client.api_execute, '/', 'GET')