Skip to content

Commit f843b52

Browse files
[DPE-3559] Stabilise restore cluster test (#351)
* Improve unit status about no connection to primary
  Signed-off-by: Marcelo Henrique Neppel <[email protected]>
* Update other users' passwords in the second cluster
  Signed-off-by: Marcelo Henrique Neppel <[email protected]>
* Add unit tests
  Signed-off-by: Marcelo Henrique Neppel <[email protected]>
---------
Signed-off-by: Marcelo Henrique Neppel <[email protected]>
1 parent 64b65b4 commit f843b52

File tree

3 files changed

+235
-23
lines changed

3 files changed

+235
-23
lines changed

src/charm.py

Lines changed: 6 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,7 @@
8989

9090
logger = logging.getLogger(__name__)
9191

92-
NO_PRIMARY_MESSAGE = "no primary in the cluster"
92+
PRIMARY_NOT_REACHABLE_MESSAGE = "waiting for primary to be reachable from this unit"
9393
EXTENSIONS_DEPENDENCY_MESSAGE = "Unsatisfied plugin dependencies. Please check the logs"
9494

9595
Scopes = Literal[APP_SCOPE, UNIT_SCOPE]
@@ -387,7 +387,7 @@ def _on_peer_relation_departed(self, event: RelationDepartedEvent) -> None:
387387
if self.primary_endpoint:
388388
self._update_relation_endpoints()
389389
else:
390-
self.unit.status = BlockedStatus(NO_PRIMARY_MESSAGE)
390+
self.unit.status = WaitingStatus(PRIMARY_NOT_REACHABLE_MESSAGE)
391391
return
392392

393393
def _on_pgdata_storage_detaching(self, _) -> None:
@@ -513,10 +513,10 @@ def _update_new_unit_status(self) -> None:
513513
# a failed switchover, so wait until the primary is elected.
514514
if self.primary_endpoint:
515515
self._update_relation_endpoints()
516-
if not self.is_blocked or self.unit.status.message == NO_PRIMARY_MESSAGE:
516+
if not self.is_blocked:
517517
self.unit.status = ActiveStatus()
518518
else:
519-
self.unit.status = BlockedStatus(NO_PRIMARY_MESSAGE)
519+
self.unit.status = WaitingStatus(PRIMARY_NOT_REACHABLE_MESSAGE)
520520

521521
def _reconfigure_cluster(self, event: HookEvent):
522522
"""Reconfigure the cluster by adding and removing members IPs to it.
@@ -765,9 +765,7 @@ def _on_cluster_topology_change(self, _):
765765
logger.info("Cluster topology changed")
766766
if self.primary_endpoint:
767767
self._update_relation_endpoints()
768-
if self.is_blocked and self.unit.status.message == NO_PRIMARY_MESSAGE:
769-
if self.primary_endpoint:
770-
self.unit.status = ActiveStatus()
768+
self.unit.status = ActiveStatus()
771769

772770
def _on_install(self, event: InstallEvent) -> None:
773771
"""Install prerequisites for the application."""
@@ -837,7 +835,7 @@ def _on_leader_elected(self, event: LeaderElectedEvent) -> None:
837835
if self.primary_endpoint:
838836
self._update_relation_endpoints()
839837
else:
840-
self.unit.status = BlockedStatus(NO_PRIMARY_MESSAGE)
838+
self.unit.status = WaitingStatus(PRIMARY_NOT_REACHABLE_MESSAGE)
841839

842840
def _on_config_changed(self, _) -> None:
843841
"""Handle configuration changes, like enabling plugins."""

tests/integration/ha_tests/test_restore_cluster.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -62,9 +62,10 @@ async def test_build_and_deploy(ops_test: OpsTest) -> None:
6262
primary = await get_primary(
6363
ops_test, ops_test.model.applications[FIRST_APPLICATION].units[0].name
6464
)
65-
password = await get_password(ops_test, primary)
66-
second_primary = ops_test.model.applications[SECOND_APPLICATION].units[0].name
67-
await set_password(ops_test, second_primary, password=password)
65+
for user in ["monitoring", "operator", "replication", "rewind"]:
66+
password = await get_password(ops_test, primary, user)
67+
second_primary = ops_test.model.applications[SECOND_APPLICATION].units[0].name
68+
await set_password(ops_test, second_primary, user, password)
6869
await ops_test.model.destroy_unit(second_primary)
6970

7071

tests/unit/test_charm.py

Lines changed: 225 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
import platform
66
import subprocess
77
import unittest
8-
from unittest.mock import MagicMock, Mock, PropertyMock, mock_open, patch
8+
from unittest.mock import MagicMock, Mock, PropertyMock, call, mock_open, patch
99

1010
import pytest
1111
from charms.operator_libs_linux.v2 import snap
@@ -14,6 +14,7 @@
1414
PostgreSQLEnableDisableExtensionError,
1515
PostgreSQLUpdateUserPasswordError,
1616
)
17+
from ops import Unit
1718
from ops.framework import EventBase
1819
from ops.model import (
1920
ActiveStatus,
@@ -26,7 +27,11 @@
2627
from parameterized import parameterized
2728
from tenacity import RetryError
2829

29-
from charm import EXTENSIONS_DEPENDENCY_MESSAGE, NO_PRIMARY_MESSAGE, PostgresqlOperatorCharm
30+
from charm import (
31+
EXTENSIONS_DEPENDENCY_MESSAGE,
32+
PRIMARY_NOT_REACHABLE_MESSAGE,
33+
PostgresqlOperatorCharm,
34+
)
3035
from cluster import RemoveRaftMemberFailedError
3136
from constants import PEER, POSTGRESQL_SNAP_NAME, SECRET_INTERNAL_LABEL, SNAP_PACKAGES
3237
from tests.helpers import patch_network_get
@@ -208,12 +213,12 @@ def test_on_leader_elected(
208213
_update_relation_endpoints.assert_called_once()
209214
self.assertFalse(isinstance(self.harness.model.unit.status, BlockedStatus))
210215

211-
# Check for a BlockedStatus when there is no primary endpoint.
216+
# Check for a WaitingStatus when the primary is not reachable yet.
212217
_primary_endpoint.return_value = None
213218
self.harness.set_leader(False)
214219
self.harness.set_leader()
215220
_update_relation_endpoints.assert_called_once() # Assert it was not called again.
216-
self.assertTrue(isinstance(self.harness.model.unit.status, BlockedStatus))
221+
self.assertTrue(isinstance(self.harness.model.unit.status, WaitingStatus))
217222

218223
def test_is_cluster_initialised(self):
219224
# Test when the cluster was not initialised yet.
@@ -1270,15 +1275,14 @@ def test_on_cluster_topology_change(self, _primary_endpoint, _update_relation_en
12701275
def test_on_cluster_topology_change_keep_blocked(
12711276
self, _update_relation_endpoints, _primary_endpoint
12721277
):
1273-
self.harness.model.unit.status = BlockedStatus(NO_PRIMARY_MESSAGE)
1278+
self.harness.model.unit.status = WaitingStatus(PRIMARY_NOT_REACHABLE_MESSAGE)
12741279

12751280
self.charm._on_cluster_topology_change(Mock())
12761281

12771282
_update_relation_endpoints.assert_not_called()
1278-
self.assertEqual(_primary_endpoint.call_count, 2)
1279-
_primary_endpoint.assert_called_with()
1280-
self.assertTrue(isinstance(self.harness.model.unit.status, BlockedStatus))
1281-
self.assertEqual(self.harness.model.unit.status.message, NO_PRIMARY_MESSAGE)
1283+
_primary_endpoint.assert_called_once_with()
1284+
self.assertTrue(isinstance(self.harness.model.unit.status, WaitingStatus))
1285+
self.assertEqual(self.harness.model.unit.status.message, PRIMARY_NOT_REACHABLE_MESSAGE)
12821286

12831287
@patch(
12841288
"charm.PostgresqlOperatorCharm.primary_endpoint",
@@ -1289,13 +1293,12 @@ def test_on_cluster_topology_change_keep_blocked(
12891293
def test_on_cluster_topology_change_clear_blocked(
12901294
self, _update_relation_endpoints, _primary_endpoint
12911295
):
1292-
self.harness.model.unit.status = BlockedStatus(NO_PRIMARY_MESSAGE)
1296+
self.harness.model.unit.status = WaitingStatus(PRIMARY_NOT_REACHABLE_MESSAGE)
12931297

12941298
self.charm._on_cluster_topology_change(Mock())
12951299

12961300
_update_relation_endpoints.assert_called_once_with()
1297-
self.assertEqual(_primary_endpoint.call_count, 2)
1298-
_primary_endpoint.assert_called_with()
1301+
_primary_endpoint.assert_called_once_with()
12991302
self.assertTrue(isinstance(self.harness.model.unit.status, ActiveStatus))
13001303

13011304
@patch_network_get(private_address="1.1.1.1")
@@ -1936,3 +1939,213 @@ def test_migration_from_single_secret(self, scope, is_leader, _, __):
19361939
assert SECRET_INTERNAL_LABEL not in self.harness.get_relation_data(
19371940
self.rel_id, getattr(self.charm, scope).name
19381941
)
1942+
1943+
@patch("charm.PostgresqlOperatorCharm._update_relation_endpoints")
1944+
@patch("charm.PostgresqlOperatorCharm.primary_endpoint", new_callable=PropertyMock)
1945+
@patch("charm.PostgresqlOperatorCharm.update_config")
1946+
@patch("charm.PostgresqlOperatorCharm._remove_from_members_ips")
1947+
@patch("charm.Patroni.are_all_members_ready")
1948+
@patch("charm.PostgresqlOperatorCharm._get_ips_to_remove")
1949+
@patch("charm.PostgresqlOperatorCharm._updated_synchronous_node_count")
1950+
@patch("charm.Patroni.remove_raft_member")
1951+
@patch("charm.PostgresqlOperatorCharm._unit_ip")
1952+
@patch("charm.Patroni.get_member_ip")
1953+
def test_on_peer_relation_departed(
1954+
self,
1955+
_get_member_ip,
1956+
_unit_ip,
1957+
_remove_raft_member,
1958+
_updated_synchronous_node_count,
1959+
_get_ips_to_remove,
1960+
_are_all_members_ready,
1961+
_remove_from_members_ips,
1962+
_update_config,
1963+
_primary_endpoint,
1964+
_update_relation_endpoints,
1965+
):
1966+
# Test when the current unit is the departing unit.
1967+
self.charm.unit.status = ActiveStatus()
1968+
event = Mock()
1969+
event.departing_unit = self.harness.charm.unit
1970+
self.charm._on_peer_relation_departed(event)
1971+
_remove_raft_member.assert_not_called()
1972+
event.defer.assert_not_called()
1973+
_updated_synchronous_node_count.assert_not_called()
1974+
_get_ips_to_remove.assert_not_called()
1975+
_remove_from_members_ips.assert_not_called()
1976+
_update_config.assert_not_called()
1977+
_update_relation_endpoints.assert_not_called()
1978+
self.assertIsInstance(self.charm.unit.status, ActiveStatus)
1979+
1980+
# Test when the current unit is not the departing unit, but removing
1981+
# the member from the raft cluster fails.
1982+
_remove_raft_member.side_effect = RemoveRaftMemberFailedError
1983+
event.departing_unit = Unit(
1984+
f"{self.charm.app.name}/1", None, self.harness.charm.app._backend, {}
1985+
)
1986+
mock_ip_address = "1.1.1.1"
1987+
_get_member_ip.return_value = mock_ip_address
1988+
self.charm._on_peer_relation_departed(event)
1989+
_remove_raft_member.assert_called_once_with(mock_ip_address)
1990+
event.defer.assert_called_once()
1991+
_updated_synchronous_node_count.assert_not_called()
1992+
_get_ips_to_remove.assert_not_called()
1993+
_remove_from_members_ips.assert_not_called()
1994+
_update_config.assert_not_called()
1995+
_update_relation_endpoints.assert_not_called()
1996+
self.assertIsInstance(self.charm.unit.status, ActiveStatus)
1997+
1998+
# Test when the member is successfully removed from the raft cluster,
1999+
# but the unit is not the leader.
2000+
_remove_raft_member.reset_mock()
2001+
event.defer.reset_mock()
2002+
_remove_raft_member.side_effect = None
2003+
self.charm._on_peer_relation_departed(event)
2004+
_remove_raft_member.assert_called_once_with(mock_ip_address)
2005+
event.defer.assert_not_called()
2006+
_updated_synchronous_node_count.assert_not_called()
2007+
_get_ips_to_remove.assert_not_called()
2008+
_remove_from_members_ips.assert_not_called()
2009+
_update_config.assert_not_called()
2010+
_update_relation_endpoints.assert_not_called()
2011+
self.assertIsInstance(self.charm.unit.status, ActiveStatus)
2012+
2013+
# Test when the unit is the leader, but the cluster hasn't initialized yet,
2014+
# or it was unable to set synchronous_node_count.
2015+
_remove_raft_member.reset_mock()
2016+
with self.harness.hooks_disabled():
2017+
self.harness.set_leader()
2018+
self.charm._on_peer_relation_departed(event)
2019+
_remove_raft_member.assert_called_once_with(mock_ip_address)
2020+
event.defer.assert_called_once()
2021+
_updated_synchronous_node_count.assert_not_called()
2022+
_get_ips_to_remove.assert_not_called()
2023+
_remove_from_members_ips.assert_not_called()
2024+
_update_config.assert_not_called()
2025+
_update_relation_endpoints.assert_not_called()
2026+
self.assertIsInstance(self.charm.unit.status, ActiveStatus)
2027+
2028+
_remove_raft_member.reset_mock()
2029+
event.defer.reset_mock()
2030+
_updated_synchronous_node_count.return_value = False
2031+
with self.harness.hooks_disabled():
2032+
self.harness.update_relation_data(
2033+
self.rel_id, self.charm.app.name, {"cluster_initialised": "True"}
2034+
)
2035+
self.charm._on_peer_relation_departed(event)
2036+
_remove_raft_member.assert_called_once_with(mock_ip_address)
2037+
event.defer.assert_called_once()
2038+
_updated_synchronous_node_count.assert_called_once_with(1)
2039+
_get_ips_to_remove.assert_not_called()
2040+
_remove_from_members_ips.assert_not_called()
2041+
_update_config.assert_not_called()
2042+
_update_relation_endpoints.assert_not_called()
2043+
self.assertIsInstance(self.charm.unit.status, ActiveStatus)
2044+
2045+
# Test when there are more units in the cluster.
2046+
_remove_raft_member.reset_mock()
2047+
event.defer.reset_mock()
2048+
_updated_synchronous_node_count.reset_mock()
2049+
self.harness.add_relation_unit(self.rel_id, f"{self.charm.app.name}/2")
2050+
self.charm._on_peer_relation_departed(event)
2051+
_remove_raft_member.assert_called_once_with(mock_ip_address)
2052+
event.defer.assert_called_once()
2053+
_updated_synchronous_node_count.assert_called_once_with(2)
2054+
_get_ips_to_remove.assert_not_called()
2055+
_remove_from_members_ips.assert_not_called()
2056+
_update_config.assert_not_called()
2057+
_update_relation_endpoints.assert_not_called()
2058+
self.assertIsInstance(self.charm.unit.status, ActiveStatus)
2059+
2060+
# Test when the cluster is initialised, and it could set synchronous_node_count,
2061+
# but there are no IPs to be removed from the members list.
2062+
_remove_raft_member.reset_mock()
2063+
event.defer.reset_mock()
2064+
_updated_synchronous_node_count.reset_mock()
2065+
_updated_synchronous_node_count.return_value = True
2066+
self.charm._on_peer_relation_departed(event)
2067+
_remove_raft_member.assert_called_once_with(mock_ip_address)
2068+
event.defer.assert_not_called()
2069+
_updated_synchronous_node_count.assert_called_once_with(2)
2070+
_get_ips_to_remove.assert_called_once()
2071+
_remove_from_members_ips.assert_not_called()
2072+
_update_config.assert_not_called()
2073+
_update_relation_endpoints.assert_not_called()
2074+
self.assertIsInstance(self.charm.unit.status, ActiveStatus)
2075+
2076+
# Test when there are IPs to be removed from the members list, but not all
2077+
# the members are ready yet.
2078+
_remove_raft_member.reset_mock()
2079+
_updated_synchronous_node_count.reset_mock()
2080+
_get_ips_to_remove.reset_mock()
2081+
ips_to_remove = ["2.2.2.2", "3.3.3.3"]
2082+
_get_ips_to_remove.return_value = ips_to_remove
2083+
_are_all_members_ready.return_value = False
2084+
self.charm._on_peer_relation_departed(event)
2085+
_remove_raft_member.assert_called_once_with(mock_ip_address)
2086+
event.defer.assert_called_once()
2087+
_updated_synchronous_node_count.assert_called_once_with(2)
2088+
_get_ips_to_remove.assert_called_once()
2089+
_remove_from_members_ips.assert_not_called()
2090+
_update_config.assert_not_called()
2091+
_update_relation_endpoints.assert_not_called()
2092+
self.assertIsInstance(self.charm.unit.status, ActiveStatus)
2093+
2094+
# Test when all members are ready.
2095+
_remove_raft_member.reset_mock()
2096+
event.defer.reset_mock()
2097+
_updated_synchronous_node_count.reset_mock()
2098+
_get_ips_to_remove.reset_mock()
2099+
_are_all_members_ready.return_value = True
2100+
self.charm._on_peer_relation_departed(event)
2101+
_remove_raft_member.assert_called_once_with(mock_ip_address)
2102+
event.defer.assert_not_called()
2103+
_updated_synchronous_node_count.assert_called_once_with(2)
2104+
_get_ips_to_remove.assert_called_once()
2105+
_remove_from_members_ips.assert_has_calls([call(ips_to_remove[0]), call(ips_to_remove[1])])
2106+
self.assertEqual(_update_config.call_count, 2)
2107+
self.assertEqual(_update_relation_endpoints.call_count, 2)
2108+
self.assertIsInstance(self.charm.unit.status, ActiveStatus)
2109+
2110+
# Test when the primary is not reachable yet.
2111+
_remove_raft_member.reset_mock()
2112+
event.defer.reset_mock()
2113+
_updated_synchronous_node_count.reset_mock()
2114+
_get_ips_to_remove.reset_mock()
2115+
_remove_from_members_ips.reset_mock()
2116+
_update_config.reset_mock()
2117+
_update_relation_endpoints.reset_mock()
2118+
_primary_endpoint.return_value = None
2119+
self.charm._on_peer_relation_departed(event)
2120+
_remove_raft_member.assert_called_once_with(mock_ip_address)
2121+
event.defer.assert_not_called()
2122+
_updated_synchronous_node_count.assert_called_once_with(2)
2123+
_get_ips_to_remove.assert_called_once()
2124+
_remove_from_members_ips.assert_called_once()
2125+
_update_config.assert_called_once()
2126+
_update_relation_endpoints.assert_not_called()
2127+
self.assertIsInstance(self.charm.unit.status, WaitingStatus)
2128+
2129+
@patch("charm.PostgresqlOperatorCharm._update_relation_endpoints")
2130+
@patch("charm.PostgresqlOperatorCharm.primary_endpoint", new_callable=PropertyMock)
2131+
def test_update_new_unit_status(self, _primary_endpoint, _update_relation_endpoints):
2132+
# Test when the charm is blocked.
2133+
_primary_endpoint.return_value = "endpoint"
2134+
self.charm.unit.status = BlockedStatus("fake blocked status")
2135+
self.charm._update_new_unit_status()
2136+
_update_relation_endpoints.assert_called_once()
2137+
self.assertIsInstance(self.charm.unit.status, BlockedStatus)
2138+
2139+
# Test when the charm is not blocked.
2140+
_update_relation_endpoints.reset_mock()
2141+
self.charm.unit.status = WaitingStatus()
2142+
self.charm._update_new_unit_status()
2143+
_update_relation_endpoints.assert_called_once()
2144+
self.assertIsInstance(self.charm.unit.status, ActiveStatus)
2145+
2146+
# Test when the primary endpoint is not reachable yet.
2147+
_update_relation_endpoints.reset_mock()
2148+
_primary_endpoint.return_value = None
2149+
self.charm._update_new_unit_status()
2150+
_update_relation_endpoints.assert_not_called()
2151+
self.assertIsInstance(self.charm.unit.status, WaitingStatus)

0 commit comments

Comments
 (0)