Skip to content

Commit c9f6554

Browse files
authored
[DPE-6484] Add scope to promote to primary (#850)
* Promote unit action * Tweaks for failing REST calls * VM parity
1 parent 842aa47 commit c9f6554

File tree

9 files changed

+114
-18
lines changed

9 files changed

+114
-18
lines changed

actions.yaml

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,8 +34,13 @@ list-backups:
3434
pre-upgrade-check:
3535
description: Run necessary pre-upgrade checks and preparations before executing a charm refresh.
3636
promote-to-primary:
37-
description: Promotes the cluster of choice to a primary cluster. Must be ran against the leader unit.
37+
description: Promotes the cluster of choice to a primary cluster. Must be run against the leader unit when promoting a cluster
38+
or against the unit to be promoted within the cluster.
3839
params:
40+
scope:
41+
type: string
42+
default: cluster
43+
description: Whether to promote a unit or a cluster. Must be set to either unit or cluster.
3944
force:
4045
type: boolean
4146
description: Force the promotion of a cluster when there is already a primary cluster.

src/charm.py

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,7 @@
112112
WORKLOAD_OS_GROUP,
113113
WORKLOAD_OS_USER,
114114
)
115-
from patroni import NotReadyError, Patroni, SwitchoverFailedError
115+
from patroni import NotReadyError, Patroni, SwitchoverFailedError, SwitchoverNotSyncError
116116
from relations.async_replication import (
117117
REPLICATION_CONSUMER_RELATION,
118118
REPLICATION_OFFER_RELATION,
@@ -211,6 +211,7 @@ def __init__(self, *args):
211211
self.framework.observe(self.on.stop, self._on_stop)
212212
self.framework.observe(self.on.get_password_action, self._on_get_password)
213213
self.framework.observe(self.on.set_password_action, self._on_set_password)
214+
self.framework.observe(self.on.promote_to_primary_action, self._on_promote_to_primary)
214215
self.framework.observe(self.on.get_primary_action, self._on_get_primary)
215216
self.framework.observe(self.on.update_status, self._on_update_status)
216217
self._storage_path = self.meta.storages["pgdata"].location
@@ -1305,6 +1306,26 @@ def _on_set_password(self, event: ActionEvent) -> None:
13051306

13061307
event.set_results({"password": password})
13071308

1309+
def _on_promote_to_primary(self, event: ActionEvent) -> None:
    """Dispatch the promote-to-primary action according to its ``scope`` parameter.

    ``cluster`` hands the event to the async replication handler and ``unit``
    hands it to ``promote_primary_unit``; any other value fails the action.
    A missing ``scope`` is treated as ``cluster``, which is the default
    declared for the action parameter.

    Args:
        event: the action event carrying the ``scope`` (and optional ``force``) params.
    """
    # Read the parameter once; fall back to the declared default when it is absent.
    scope = event.params.get("scope", "cluster")
    if scope == "cluster":
        return self.async_replication.promote_to_primary(event)
    if scope == "unit":
        return self.promote_primary_unit(event)
    event.fail("Scope should be either cluster or unit")
1316+
1317+
def promote_primary_unit(self, event: ActionEvent) -> None:
    """Handle promote-to-primary for unit scope.

    Asks Patroni to switch over to this unit without waiting for the
    switchover to complete. The action fails when ``force`` is set, when
    Patroni reports that this unit is not the sync standby, or when the
    switchover request itself fails.

    Args:
        event: the action event; its ``force`` param must not be set.
    """
    # ``force`` is rejected explicitly instead of being silently ignored.
    if event.params.get("force"):
        event.fail("Superfluous force flag with unit scope")
        return

    try:
        self._patroni.switchover(self.unit.name, wait=False)
    except SwitchoverNotSyncError:
        # Must be handled before SwitchoverFailedError, which is its base class.
        event.fail("Unit is not sync standby")
    except SwitchoverFailedError:
        event.fail("Switchover failed or timed out, check the logs for details")
1328+
13081329
def _on_get_primary(self, event: ActionEvent) -> None:
13091330
"""Get primary instance."""
13101331
try:

src/patroni.py

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,10 @@ class SwitchoverFailedError(Exception):
5353
"""Raised when a switchover failed for some reason."""
5454

5555

56+
class SwitchoverNotSyncError(SwitchoverFailedError):
    """Raised when a switchover is refused because the candidate is not the sync standby."""
58+
59+
5660
class UpdateSyncNodeCountError(Exception):
5761
"""Raised when updating synchronous_node_count failed for some reason."""
5862

@@ -612,7 +616,7 @@ def restart_postgresql(self) -> None:
612616
timeout=PATRONI_TIMEOUT,
613617
)
614618

615-
def switchover(self, candidate: str | None = None) -> None:
619+
def switchover(self, candidate: str | None = None, wait: bool = True) -> None:
616620
"""Trigger a switchover."""
617621
# Try to trigger the switchover.
618622
if candidate is not None:
@@ -631,8 +635,18 @@ def switchover(self, candidate: str | None = None) -> None:
631635

632636
# Check whether the switchover was unsuccessful.
633637
if r.status_code != 200:
638+
if (
639+
r.status_code == 412
640+
and r.text == "candidate name does not match with sync_standby"
641+
):
642+
logger.debug("Unit is not sync standby")
643+
raise SwitchoverNotSyncError()
644+
logger.warning(f"Switchover call failed with code {r.status_code} {r.text}")
634645
raise SwitchoverFailedError(f"received {r.status_code}")
635646

647+
if not wait:
648+
return
649+
636650
for attempt in Retrying(stop=stop_after_delay(60), wait=wait_fixed(3), reraise=True):
637651
with attempt:
638652
new_primary = self.get_primary()

src/relations/async_replication.py

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -104,9 +104,6 @@ def __init__(self, charm):
104104
self.framework.observe(
105105
self.charm.on.create_replication_action, self._on_create_replication
106106
)
107-
self.framework.observe(
108-
self.charm.on.promote_to_primary_action, self._on_promote_to_primary
109-
)
110107

111108
self.framework.observe(self.charm.on.secret_changed, self._on_secret_changed)
112109

@@ -575,7 +572,7 @@ def _on_create_replication(self, event: ActionEvent) -> None:
575572
# Set the status.
576573
self.charm.unit.status = MaintenanceStatus("Creating replication...")
577574

578-
def _on_promote_to_primary(self, event: ActionEvent) -> None:
575+
def promote_to_primary(self, event: ActionEvent) -> None:
579576
"""Promote this cluster to the primary cluster."""
580577
if (
581578
self.charm.app.status.message != READ_ONLY_MODE_BLOCKING_MESSAGE

tests/integration/ha_tests/test_async_replication.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -240,7 +240,7 @@ async def test_switchover(
240240
leader_unit = await get_leader_unit(ops_test, DATABASE_APP_NAME, model=second_model)
241241
assert leader_unit is not None, "No leader unit found"
242242
logger.info("promoting the second cluster")
243-
run_action = await leader_unit.run_action("promote-to-primary", **{"force": True})
243+
run_action = await leader_unit.run_action("promote-to-primary", force=True, scope="cluster")
244244
await run_action.wait()
245245
assert (run_action.results.get("return-code", None) == 0) or (
246246
run_action.results.get("Code", None) == "0"
@@ -295,7 +295,7 @@ async def test_promote_standby(
295295
leader_unit = await get_leader_unit(ops_test, DATABASE_APP_NAME)
296296
assert leader_unit is not None, "No leader unit found"
297297
logger.info("promoting the first cluster")
298-
run_action = await leader_unit.run_action("promote-to-primary")
298+
run_action = await leader_unit.run_action("promote-to-primary", scope="cluster")
299299
await run_action.wait()
300300
assert (run_action.results.get("return-code", None) == 0) or (
301301
run_action.results.get("Code", None) == "0"

tests/integration/test_charm.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
from pytest_operator.plugin import OpsTest
1414
from tenacity import Retrying, stop_after_delay, wait_fixed
1515

16+
from .ha_tests.helpers import get_cluster_roles
1617
from .helpers import (
1718
CHARM_BASE,
1819
METADATA,
@@ -253,6 +254,21 @@ async def test_scale_down_and_up(ops_test: OpsTest):
253254
await scale_application(ops_test, APP_NAME, initial_scale)
254255

255256

257+
async def test_switchover_sync_standby(ops_test: OpsTest):
    """Promote a sync standby with unit scope and check that it becomes the primary."""
    # Any unit can be used to query the cluster roles.
    any_unit_name = ops_test.model.applications[APP_NAME].units[0].name
    original_roles = await get_cluster_roles(ops_test, any_unit_name)
    candidate = original_roles["sync_standbys"][0]

    run_action = await ops_test.model.units[candidate].run_action(
        "promote-to-primary", scope="unit"
    )
    await run_action.wait()
    # Fail fast if the action itself was rejected, instead of waiting for idle
    # and only noticing through the role comparison below.
    assert (run_action.results.get("return-code", None) == 0) or (
        run_action.results.get("Code", None) == "0"
    ), "promote-to-primary action failed"

    await ops_test.model.wait_for_idle(status="active", timeout=200)
    new_roles = await get_cluster_roles(ops_test, any_unit_name)
    assert new_roles["primaries"][0] == candidate
270+
271+
256272
async def test_persist_data_through_graceful_restart(ops_test: OpsTest):
257273
"""Test data persists through a graceful restart."""
258274
primary = await get_primary(ops_test)

tests/unit/test_async_replication.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -314,7 +314,7 @@ def test_promote_to_primary(harness, relation_name):
314314
)
315315
harness.update_relation_data(rel_id, "standby/0", {"unit-address": "10.2.2.10"})
316316

317-
harness.run_action("promote-to-primary")
317+
harness.run_action("promote-to-primary", {"scope": "cluster"})
318318

319319
assert (
320320
harness.get_relation_data(rel_id, harness.charm.app.name).get(

tests/unit/test_charm.py

Lines changed: 40 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626

2727
from charm import EXTENSION_OBJECT_MESSAGE, PostgresqlOperatorCharm
2828
from constants import PEER, SECRET_INTERNAL_LABEL
29-
from patroni import NotReadyError
29+
from patroni import NotReadyError, SwitchoverFailedError, SwitchoverNotSyncError
3030
from tests.unit.helpers import _FakeApiError
3131

3232
POSTGRESQL_CONTAINER = "postgresql"
@@ -1847,3 +1847,42 @@ def test_get_plugins(harness):
18471847
"insert_username",
18481848
"moddatetime",
18491849
]
1850+
1851+
1852+
def test_on_promote_to_primary(harness):
    """Check scope dispatch and failure reporting of the promote-to-primary handler."""
    with (
        patch("charm.PostgreSQLAsyncReplication.promote_to_primary") as _promote_to_primary,
        patch("charm.Patroni.switchover") as _switchover,
    ):
        event = Mock()
        event.params = {"scope": "cluster"}

        # Cluster
        harness.charm._on_promote_to_primary(event)
        _promote_to_primary.assert_called_once_with(event)
        _switchover.assert_not_called()
        event.fail.assert_not_called()

        # Unit, no force, regular promotion
        event.params = {"scope": "unit"}

        harness.charm._on_promote_to_primary(event)

        _switchover.assert_called_once_with("postgresql-k8s/0", wait=False)
        event.fail.assert_not_called()

        # Unit, no force, switchover failed
        event.params = {"scope": "unit"}
        _switchover.side_effect = SwitchoverFailedError

        harness.charm._on_promote_to_primary(event)

        event.fail.assert_called_once_with(
            "Switchover failed or timed out, check the logs for details"
        )
        event.fail.reset_mock()

        # Unit, no force, not sync
        event.params = {"scope": "unit"}
        _switchover.side_effect = SwitchoverNotSyncError

        harness.charm._on_promote_to_primary(event)

        event.fail.assert_called_once_with("Unit is not sync standby")
        event.fail.reset_mock()

        # Unit, force: rejected before Patroni is contacted
        event.params = {"scope": "unit", "force": True}
        _switchover.reset_mock(side_effect=True)

        harness.charm._on_promote_to_primary(event)

        _switchover.assert_not_called()
        event.fail.assert_called_once()
        event.fail.reset_mock()

        # Unknown scope: neither promotion path is taken
        event.params = {"scope": "invalid"}

        harness.charm._on_promote_to_primary(event)

        # Still only the single call recorded in the cluster case above.
        _promote_to_primary.assert_called_once_with(event)
        _switchover.assert_not_called()
        event.fail.assert_called_once_with("Scope should be either cluster or unit")

tests/unit/test_patroni.py

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313

1414
from charm import PostgresqlOperatorCharm
1515
from constants import REWIND_USER
16-
from patroni import PATRONI_TIMEOUT, Patroni, SwitchoverFailedError
16+
from patroni import PATRONI_TIMEOUT, Patroni, SwitchoverFailedError, SwitchoverNotSyncError
1717
from tests.helpers import STORAGE_PATH
1818

1919

@@ -331,11 +331,9 @@ def test_switchover(harness, patroni):
331331
# Test failed switchovers.
332332
_post.reset_mock()
333333
_get_primary.side_effect = ["postgresql-k8s-0", "postgresql-k8s-1"]
334-
try:
334+
with pytest.raises(SwitchoverFailedError):
335335
patroni.switchover("postgresql-k8s/2")
336336
assert False
337-
except SwitchoverFailedError:
338-
pass
339337
_post.assert_called_once_with(
340338
"http://postgresql-k8s-0:8008/switchover",
341339
json={"leader": "postgresql-k8s-0", "candidate": "postgresql-k8s-2"},
@@ -347,11 +345,9 @@ def test_switchover(harness, patroni):
347345
_post.reset_mock()
348346
_get_primary.side_effect = ["postgresql-k8s-0", "postgresql-k8s-2"]
349347
response.status_code = 400
350-
try:
348+
with pytest.raises(SwitchoverFailedError):
351349
patroni.switchover("postgresql-k8s/2")
352350
assert False
353-
except SwitchoverFailedError:
354-
pass
355351
_post.assert_called_once_with(
356352
"http://postgresql-k8s-0:8008/switchover",
357353
json={"leader": "postgresql-k8s-0", "candidate": "postgresql-k8s-2"},
@@ -360,6 +356,14 @@ def test_switchover(harness, patroni):
360356
timeout=PATRONI_TIMEOUT,
361357
)
362358

359+
# Test candidate, not sync
360+
response = _post.return_value
361+
response.status_code = 412
362+
response.text = "candidate name does not match with sync_standby"
363+
with pytest.raises(SwitchoverNotSyncError):
364+
patroni.switchover("candidate")
365+
assert False
366+
363367

364368
def test_member_replication_lag(harness, patroni):
365369
with (

0 commit comments

Comments
 (0)