
Commit 31ca568

[DPE-2728] Handle scaling to zero units (#331)
* Handle scaling to zero units
* Update unit tests
* Remove unused constants
* Don't set unknown status

Signed-off-by: Marcelo Henrique Neppel <[email protected]>
1 parent: f399814

File tree: 8 files changed (+290, -31 lines)
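
For orientation, the scenario this commit has to survive is a scale down to zero Kubernetes units followed by a scale back up. A minimal sketch of that round trip with python-libjuju, assuming a deployed postgresql-k8s application and the pytest-operator ops_test fixture; the calls mirror the updated helper in tests/integration/helpers.py further down, and the names and timeout here are illustrative assumptions rather than part of this commit:

# Sketch only: APP and the timeout are illustrative assumptions, not part of this commit.
APP = "postgresql-k8s"


async def scale_to_zero_and_back(ops_test) -> None:
    # Scale the application down to zero units and wait for the pods to go away...
    await ops_test.model.applications[APP].scale(0)
    await ops_test.model.block_until(
        lambda: len(ops_test.model.applications[APP].units) == 0, timeout=1000
    )
    # ...then scale back up; the charm must re-bootstrap the cluster cleanly.
    await ops_test.model.applications[APP].scale(3)
    await ops_test.model.wait_for_idle(
        apps=[APP], status="active", timeout=1000, wait_for_exact_units=3
    )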


src/charm.py

Lines changed: 27 additions & 2 deletions
@@ -42,6 +42,7 @@
     MaintenanceStatus,
     Relation,
     Unit,
+    UnknownStatus,
     WaitingStatus,
 )
 from ops.pebble import ChangeError, Layer, PathError, ProtocolError, ServiceStatus
@@ -498,12 +499,14 @@ def enable_disable_extensions(self, database: str = None) -> None:
                 continue
             extension = plugins_exception.get(extension, extension)
             extensions[extension] = enable
-        self.unit.status = WaitingStatus("Updating extensions")
+        if not isinstance(original_status, UnknownStatus):
+            self.unit.status = WaitingStatus("Updating extensions")
         try:
             self.postgresql.enable_disable_extensions(extensions, database)
         except PostgreSQLEnableDisableExtensionError as e:
             logger.exception("failed to change plugins: %s", str(e))
-        self.unit.status = original_status
+        if not isinstance(original_status, UnknownStatus):
+            self.unit.status = original_status
 
     def _add_members(self, event) -> None:
         """Add new cluster members.
@@ -603,6 +606,23 @@ def _on_leader_elected(self, event: LeaderElectedEvent) -> None:
             self.set_secret(APP_SCOPE, MONITORING_PASSWORD_KEY, new_password())
 
         self._cleanup_old_cluster_resources()
+        client = Client()
+        try:
+            endpoint = client.get(Endpoints, name=self.cluster_name, namespace=self._namespace)
+            if "leader" not in endpoint.metadata.annotations:
+                patch = {
+                    "metadata": {
+                        "annotations": {"leader": self._unit_name_to_pod_name(self._unit)}
+                    }
+                }
+                client.patch(
+                    Endpoints, name=self.cluster_name, namespace=self._namespace, obj=patch
+                )
+                self.app_peer_data.pop("cluster_initialised", None)
+        except ApiError as e:
+            # Ignore the error only when the resource doesn't exist.
+            if e.status.code != 404:
+                raise e
 
         # Create resources and add labels needed for replication.
         try:
@@ -931,6 +951,11 @@ def _on_get_primary(self, event: ActionEvent) -> None:
             logger.error(f"failed to get primary with error {e}")
 
     def _on_stop(self, _):
+        # Remove data from the drive when scaling down to zero to prevent
+        # the cluster from getting stuck when scaling back up.
+        if self.app.planned_units() == 0:
+            self.unit_peer_data.clear()
+
         # Patch the services to remove them when the StatefulSet is deleted
         # (i.e. application is removed).
         try:

src/relations/db.py

Lines changed: 11 additions & 0 deletions
@@ -244,6 +244,13 @@ def _on_relation_departed(self, event: RelationDepartedEvent) -> None:
             event.defer()
             return
 
+        # Set a flag to avoid deleting database users when this unit
+        # is removed and receives relation broken events from related applications.
+        # This is needed because of https://bugs.launchpad.net/juju/+bug/1979811.
+        if event.departing_unit == self.charm.unit:
+            self.charm._peers.data[self.charm.unit].update({"departing": "True"})
+            return
+
         if not self.charm.unit.is_leader():
             return
 
@@ -275,6 +282,10 @@ def _on_relation_broken(self, event: RelationBrokenEvent) -> None:
             event.defer()
             return
 
+        if "departing" in self.charm._peers.data[self.charm.unit]:
+            logger.debug("Early exit on_relation_broken: Skipping departing unit")
+            return
+
         if not self.charm.unit.is_leader():
             return

src/relations/postgresql_provider.py

Lines changed: 16 additions & 1 deletion
@@ -17,7 +17,7 @@
     PostgreSQLDeleteUserError,
     PostgreSQLGetPostgreSQLVersionError,
 )
-from ops.charm import CharmBase, RelationBrokenEvent
+from ops.charm import CharmBase, RelationBrokenEvent, RelationDepartedEvent
 from ops.framework import Object
 from ops.model import ActiveStatus, BlockedStatus, Relation
 
@@ -45,6 +45,9 @@ def __init__(self, charm: CharmBase, relation_name: str = "database") -> None:
         self.relation_name = relation_name
 
         super().__init__(charm, self.relation_name)
+        self.framework.observe(
+            charm.on[self.relation_name].relation_departed, self._on_relation_departed
+        )
         self.framework.observe(
             charm.on[self.relation_name].relation_broken, self._on_relation_broken
         )
@@ -128,6 +131,14 @@ def _on_database_requested(self, event: DatabaseRequestedEvent) -> None:
                 else f"Failed to initialize {self.relation_name} relation"
             )
 
+    def _on_relation_departed(self, event: RelationDepartedEvent) -> None:
+        """Set a flag to avoid deleting database users when not wanted."""
+        # Set a flag to avoid deleting database users when this unit
+        # is removed and receives relation broken events from related applications.
+        # This is needed because of https://bugs.launchpad.net/juju/+bug/1979811.
+        if event.departing_unit == self.charm.unit:
+            self.charm._peers.data[self.charm.unit].update({"departing": "True"})
+
     def _on_relation_broken(self, event: RelationBrokenEvent) -> None:
         """Remove the user created for this relation."""
         # Check for some conditions before trying to access the PostgreSQL instance.
@@ -144,6 +155,10 @@ def _on_relation_broken(self, event: RelationBrokenEvent) -> None:
 
         self._update_unit_status(event.relation)
 
+        if "departing" in self.charm._peers.data[self.charm.unit]:
+            logger.debug("Early exit on_relation_broken: Skipping departing unit")
+            return
+
         if not self.charm.unit.is_leader():
             return
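
Both this handler and the legacy one in src/relations/db.py apply the same workaround: record in the unit's peer databag that this unit is the one departing, then skip user cleanup when the follow-up relation-broken event arrives. A condensed, hypothetical sketch of that pattern for a generic ops charm (the PEER relation name and the DepartingAwareRelation class are illustrative, not part of this commit):

from ops.charm import CharmBase, RelationBrokenEvent, RelationDepartedEvent
from ops.framework import Object

PEER = "database-peers"  # hypothetical peer relation name


class DepartingAwareRelation(Object):
    """Sketch of the departing-unit guard used by this commit's relation handlers."""

    def __init__(self, charm: CharmBase, relation_name: str) -> None:
        super().__init__(charm, relation_name)
        self.charm = charm
        self.framework.observe(
            charm.on[relation_name].relation_departed, self._on_relation_departed
        )
        self.framework.observe(
            charm.on[relation_name].relation_broken, self._on_relation_broken
        )

    @property
    def _peer_data(self):
        # Unit databag of the peer relation, used as a scratchpad for the flag.
        return self.model.get_relation(PEER).data[self.charm.unit]

    def _on_relation_departed(self, event: RelationDepartedEvent) -> None:
        # If this unit is the one leaving, remember it so that the later
        # relation-broken hook does not delete database users (LP#1979811).
        if event.departing_unit == self.charm.unit:
            self._peer_data.update({"departing": "True"})

    def _on_relation_broken(self, _: RelationBrokenEvent) -> None:
        if "departing" in self._peer_data:
            return  # Departing unit: skip user cleanup.
        # Leader-only cleanup of the relation's database users would go here.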

tests/integration/ha_tests/test_self_healing.py

Lines changed: 41 additions & 0 deletions
@@ -39,6 +39,7 @@
     get_password,
     get_unit_address,
     run_command_on_unit,
+    scale_application,
 )
 
 logger = logging.getLogger(__name__)
@@ -387,3 +388,43 @@ async def test_network_cut(
     ), "Connection is not possible after network restore"
 
     await is_cluster_updated(ops_test, primary_name)
+
+
+async def test_scaling_to_zero(ops_test: OpsTest, continuous_writes) -> None:
+    """Scale the database to zero units and scale up again."""
+    # Locate primary unit.
+    app = await app_name(ops_test)
+
+    # Start an application that continuously writes data to the database.
+    await start_continuous_writes(ops_test, app)
+
+    # Scale the database to zero units.
+    logger.info("scaling database to zero units")
+    await scale_application(ops_test, app, 0)
+
+    # Scale the database to three units.
+    logger.info("scaling database to three units")
+    await scale_application(ops_test, app, 3)
+
+    # Verify all units are up and running.
+    logger.info("waiting for the database service to start in all units")
+    for unit in ops_test.model.applications[app].units:
+        assert await is_postgresql_ready(
+            ops_test, unit.name
+        ), f"unit {unit.name} not restarted after cluster restart."
+
+    logger.info("checking whether writes are increasing")
+    await are_writes_increasing(ops_test)
+
+    # Verify that all units are part of the same cluster.
+    logger.info("checking whether all units are part of the same cluster")
+    member_ips = await fetch_cluster_members(ops_test)
+    ip_addresses = [
+        await get_unit_address(ops_test, unit.name)
+        for unit in ops_test.model.applications[app].units
+    ]
+    assert set(member_ips) == set(ip_addresses), "not all units are part of the same cluster."
+
+    # Verify that no writes to the database were missed after stopping the writes.
+    logger.info("checking whether no writes to the database were missed after stopping the writes")
+    await check_writes(ops_test)

tests/integration/helpers.py

Lines changed: 12 additions & 6 deletions
@@ -637,12 +637,18 @@ async def scale_application(ops_test: OpsTest, application_name: str, scale: int
         scale: The number of units to scale to
     """
     await ops_test.model.applications[application_name].scale(scale)
-    await ops_test.model.wait_for_idle(
-        apps=[application_name],
-        status="active",
-        timeout=1000,
-        wait_for_exact_units=scale,
-    )
+    if scale == 0:
+        await ops_test.model.block_until(
+            lambda: len(ops_test.model.applications[DATABASE_APP_NAME].units) == scale,
+            timeout=1000,
+        )
+    else:
+        await ops_test.model.wait_for_idle(
+            apps=[application_name],
+            status="active",
+            timeout=1000,
+            wait_for_exact_units=scale,
+        )
 
 
 async def set_password(
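
The zero-unit branch exists because waiting for an "active" status cannot settle when no units remain, so the helper falls back to polling the unit count with block_until. A hypothetical call site, mirroring the new integration test above (assumes the pytest-operator ops_test fixture and this module's DATABASE_APP_NAME constant):

async def test_scale_round_trip(ops_test) -> None:
    # Hypothetical test using the helper above.
    await scale_application(ops_test, DATABASE_APP_NAME, 0)  # blocks until no units remain
    await scale_application(ops_test, DATABASE_APP_NAME, 3)  # waits for three active units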

tests/unit/test_charm.py

Lines changed: 85 additions & 22 deletions
@@ -40,6 +40,7 @@ def setUp(self):
         self.addCleanup(self.harness.cleanup)
         self.harness.begin()
         self.charm = self.harness.charm
+        self._cluster_name = f"patroni-{self.charm.app.name}"
         self._context = {
             "namespace": self.harness.model.name,
             "app_name": self.harness.model.app.name,
@@ -51,27 +52,75 @@ def use_caplog(self, caplog):
     def use_caplog(self, caplog):
         self._caplog = caplog
 
+    @patch("charm.PostgresqlOperatorCharm._add_members")
+    @patch("charm.Client")
     @patch("charm.new_password", return_value="sekr1t")
     @patch("charm.PostgresqlOperatorCharm.get_secret", return_value=None)
     @patch("charm.PostgresqlOperatorCharm.set_secret")
     @patch("charm.Patroni.reload_patroni_configuration")
     @patch("charm.PostgresqlOperatorCharm._patch_pod_labels")
     @patch("charm.PostgresqlOperatorCharm._create_services")
-    def test_on_leader_elected(self, _, __, ___, _set_secret, _get_secret, _____):
-        # Check that a new password was generated on leader election.
+    def test_on_leader_elected(self, _, __, ___, _set_secret, _get_secret, _____, _client, ______):
+        # Check that a new password was generated on leader election and nothing is done
+        # because the "leader" key is present in the endpoint annotations due to a scale
+        # down to zero units.
+        with self.harness.hooks_disabled():
+            self.harness.update_relation_data(
+                self.rel_id, self.charm.app.name, {"cluster_initialised": "True"}
+            )
+        _client.return_value.get.return_value = MagicMock(
+            metadata=MagicMock(annotations=["leader"])
+        )
+        _client.return_value.list.side_effect = [
+            [MagicMock(metadata=MagicMock(name="fakeName1", namespace="fakeNamespace"))],
+            [MagicMock(metadata=MagicMock(name="fakeName2", namespace="fakeNamespace"))],
+        ]
         self.harness.set_leader()
         assert _set_secret.call_count == 4
         _set_secret.assert_any_call("app", "operator-password", "sekr1t")
         _set_secret.assert_any_call("app", "replication-password", "sekr1t")
         _set_secret.assert_any_call("app", "rewind-password", "sekr1t")
         _set_secret.assert_any_call("app", "monitoring-password", "sekr1t")
+        _client.return_value.get.assert_called_once_with(
+            Endpoints, name=self._cluster_name, namespace=self.charm.model.name
+        )
+        _client.return_value.patch.assert_not_called()
+        self.assertIn(
+            "cluster_initialised", self.harness.get_relation_data(self.rel_id, self.charm.app)
+        )
 
-        # Trigger a new leader election and check that the password is still the same.
+        # Trigger a new leader election and check that the password is still the same, and that the charm
+        # fixes the missing "leader" key in the endpoint annotations.
+        _client.reset_mock()
+        _client.return_value.get.return_value = MagicMock(metadata=MagicMock(annotations=[]))
         _set_secret.reset_mock()
         _get_secret.return_value = "test"
         self.harness.set_leader(False)
         self.harness.set_leader()
         assert _set_secret.call_count == 0
+        _client.return_value.get.assert_called_once_with(
+            Endpoints, name=self._cluster_name, namespace=self.charm.model.name
+        )
+        _client.return_value.patch.assert_called_once_with(
+            Endpoints,
+            name=self._cluster_name,
+            namespace=self.charm.model.name,
+            obj={"metadata": {"annotations": {"leader": "postgresql-k8s-0"}}},
+        )
+        self.assertNotIn(
+            "cluster_initialised", self.harness.get_relation_data(self.rel_id, self.charm.app)
+        )
+
+        # Test a failure in fixing the "leader" key in the endpoint annotations.
+        _client.return_value.patch.side_effect = _FakeApiError
+        with self.assertRaises(_FakeApiError):
+            self.harness.set_leader(False)
+            self.harness.set_leader()
+
+        # Test no failure if the resource doesn't exist.
+        _client.return_value.patch.side_effect = _FakeApiError(404)
+        self.harness.set_leader(False)
+        self.harness.set_leader()
 
     @patch("charm.Patroni.rock_postgresql_version", new_callable=PropertyMock)
     @patch("charm.Patroni.primary_endpoint_ready", new_callable=PropertyMock)
@@ -653,28 +702,42 @@ def test_set_secret(self, _, __):
     @patch("charm.Client")
     def test_on_stop(self, _client):
         # Test a successful run of the hook.
-        with self.assertNoLogs("charm", "ERROR"):
-            _client.return_value.get.return_value = MagicMock(
-                metadata=MagicMock(ownerReferences="fakeOwnerReferences")
-            )
-            _client.return_value.list.side_effect = [
-                [MagicMock(metadata=MagicMock(name="fakeName1", namespace="fakeNamespace"))],
-                [MagicMock(metadata=MagicMock(name="fakeName2", namespace="fakeNamespace"))],
-            ]
-            self.charm.on.stop.emit()
-            _client.return_value.get.assert_called_once_with(
-                res=Pod, name="postgresql-k8s-0", namespace=self.charm.model.name
-            )
-            for kind in [Endpoints, Service]:
-                _client.return_value.list.assert_any_call(
-                    kind,
-                    namespace=self.charm.model.name,
-                    labels={"app.juju.is/created-by": self.charm.app.name},
+        for planned_units, relation_data in {
+            0: {},
+            1: {"some-relation-data": "some-value"},
+        }.items():
+            self.harness.set_planned_units(planned_units)
+            with self.harness.hooks_disabled():
+                self.harness.update_relation_data(
+                    self.rel_id,
+                    self.charm.unit.name,
+                    {"some-relation-data": "some-value"},
                 )
-            self.assertEqual(_client.return_value.apply.call_count, 2)
+            with self.assertNoLogs("charm", "ERROR"):
+                _client.return_value.get.return_value = MagicMock(
+                    metadata=MagicMock(ownerReferences="fakeOwnerReferences")
+                )
+                _client.return_value.list.side_effect = [
+                    [MagicMock(metadata=MagicMock(name="fakeName1", namespace="fakeNamespace"))],
+                    [MagicMock(metadata=MagicMock(name="fakeName2", namespace="fakeNamespace"))],
+                ]
+                self.charm.on.stop.emit()
+                _client.return_value.get.assert_called_once_with(
+                    res=Pod, name="postgresql-k8s-0", namespace=self.charm.model.name
+                )
+                for kind in [Endpoints, Service]:
+                    _client.return_value.list.assert_any_call(
+                        kind,
+                        namespace=self.charm.model.name,
+                        labels={"app.juju.is/created-by": self.charm.app.name},
+                    )
+                self.assertEqual(_client.return_value.apply.call_count, 2)
+                self.assertEqual(
+                    self.harness.get_relation_data(self.rel_id, self.charm.unit), relation_data
+                )
            _client.reset_mock()
 
         # Test when the charm fails to get first pod info.
-        _client.reset_mock()
         _client.return_value.get.side_effect = _FakeApiError
         with self.assertLogs("charm", "ERROR") as logs:
            self.charm.on.stop.emit()
