Skip to content

Commit 1c11581

Browse files
DPE-7726: Avoid pending_restart=True flag flickering
The current Patroni 3.2.2 has wired/flickering behaviour: it temporary flag pending_restart=True on many changes to REST API, which is gone within a second but long enough to be cougth by charm. Sleepping a bit is a necessary evil, until Patroni 3.3.0 upgrade. The previous code sleept for 15 seconds waiting for pg_settings update. Also, the unnecessary restarts could be triggered by missmatch of Patroni config file and in-memory changes coming from REST API, e.g. the slots were undefined in yaml file but set as an empty JSON {} => None. Updating the default template to match the default API PATCHes and avoid restarts.
1 parent f1c4356 commit 1c11581

File tree

4 files changed

+27
-14
lines changed

4 files changed

+27
-14
lines changed

src/charm.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2365,7 +2365,7 @@ def update_config(
23652365
parameters=pg_parameters,
23662366
no_peers=no_peers,
23672367
user_databases_map=self.relations_user_databases_map,
2368-
slots=replication_slots or None,
2368+
slots=replication_slots,
23692369
)
23702370
if no_peers:
23712371
return True

src/cluster.py

Lines changed: 23 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
from contextlib import suppress
1717
from pathlib import Path
1818
from ssl import CERT_NONE, create_default_context
19+
from time import sleep
1920
from typing import TYPE_CHECKING, Any, TypedDict
2021

2122
import psutil
@@ -466,18 +467,32 @@ def get_patroni_health(self) -> dict[str, str]:
466467

467468
def is_restart_pending(self) -> bool:
468469
"""Returns whether the Patroni/PostgreSQL restart pending."""
469-
patroni_status = requests.get(
470+
pending_restart = self._get_patroni_restart_pending()
471+
if pending_restart:
472+
# The current Patroni 3.2.2 has wired behaviour: it temporary flag pending_restart=True
473+
# on any changes to REST API, which is gone within a second but long enough to be
474+
# cougth by charm. Sleep 2 seconds as a protection here until Patroni 3.3.0 upgrade.
475+
# Repeat the request to make sure pending_restart flag is still here
476+
logger.debug("Enduring restart is pending (to avoid unnecessary rolling restarts)")
477+
sleep(2)
478+
pending_restart = self._get_patroni_restart_pending()
479+
480+
return pending_restart
481+
482+
def _get_patroni_restart_pending(self) -> bool:
483+
"""Returns whether the Patroni flag pending_restart on REST API."""
484+
r = requests.get(
470485
f"{self._patroni_url}/patroni",
471486
verify=self.verify,
472487
timeout=API_REQUEST_TIMEOUT,
473488
auth=self._patroni_auth,
474489
)
475-
try:
476-
pending_restart = patroni_status.json()["pending_restart"]
477-
except KeyError:
478-
pending_restart = False
479-
pass
480-
logger.debug(f"Patroni API is_restart_pending: {pending_restart}")
490+
pending_restart = r.json().get("pending_restart", False)
491+
logger.debug(
492+
f"API _get_patroni_restart_pending ({pending_restart}): %s (%s)",
493+
r,
494+
r.elapsed.total_seconds(),
495+
)
481496

482497
return pending_restart
483498

@@ -1083,7 +1098,7 @@ def ensure_slots_controller_by_patroni(self, slots: dict[str, str]) -> None:
10831098
f"Failed to get current Patroni config: {current_config.status_code} {current_config.text}"
10841099
)
10851100
slots_patch: dict[str, dict[str, str] | None] = dict.fromkeys(
1086-
current_config.json().get("slots", ())
1101+
current_config.json().get("slots", ()) or {}
10871102
)
10881103
for slot, database in slots.items():
10891104
slots_patch[slot] = {

templates/patroni.yml.j2

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -101,15 +101,13 @@ bootstrap:
101101
set_user.block_log_statement: 'on'
102102
set_user.exit_on_error: 'on'
103103
set_user.superuser_allowlist: '+charmed_dba'
104-
{%- if slots %}
105104
slots:
106105
{%- for slot, database in slots.items() %}
107106
{{slot}}:
108107
database: {{database}}
109108
plugin: pgoutput
110109
type: logical
111110
{%- endfor -%}
112-
{% endif %}
113111

114112
{%- if restoring_backup %}
115113
method: pgbackrest
@@ -202,7 +200,7 @@ postgresql:
202200
{%- endfor %}
203201
{%- for peer_ip in peers_ips %}
204202
- {{ 'hostssl' if enable_tls else 'host' }} replication replication {{ peer_ip }}/0 md5
205-
{% endfor %}
203+
{%- endfor %}
206204
pg_ident:
207205
- operator _daemon_ backup
208206
authentication:

tests/unit/test_charm.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1162,7 +1162,7 @@ class _MockSnap:
11621162
parameters={"test": "test"},
11631163
no_peers=False,
11641164
user_databases_map={"operator": "all", "replication": "all", "rewind": "all"},
1165-
slots=None,
1165+
slots={},
11661166
)
11671167
_handle_postgresql_restart_need.assert_called_once_with()
11681168
_restart_ldap_sync_service.assert_called_once()
@@ -1193,7 +1193,7 @@ class _MockSnap:
11931193
parameters={"test": "test"},
11941194
no_peers=False,
11951195
user_databases_map={"operator": "all", "replication": "all", "rewind": "all"},
1196-
slots=None,
1196+
slots={},
11971197
)
11981198
_handle_postgresql_restart_need.assert_called_once()
11991199
_restart_ldap_sync_service.assert_called_once()

0 commit comments

Comments
 (0)