Skip to content

Commit 1e9eb89

Browse files
authored
Remove reinits (#999)
1 parent de0f011 commit 1e9eb89

File tree

2 files changed: 3 additions and 76 deletions.

2 files changed

+3
-76
lines changed

src/charm.py

Lines changed: 2 additions & 52 deletions
Original file line number | Diff line number | Diff line change
@@ -913,23 +913,6 @@ def _on_peer_relation_changed(self, event: HookEvent):
913913
event.defer()
914914
return
915915

916-
# Restart the workload if it's stuck on the starting state after a timeline divergence
917-
# due to a backup that was restored.
918-
if (
919-
not self.is_primary
920-
and not self.is_standby_leader
921-
and (
922-
self._patroni.member_replication_lag == "unknown"
923-
or int(self._patroni.member_replication_lag) > 1000
924-
)
925-
):
926-
logger.warning("Degraded member detected: reinitialising unit")
927-
self.set_unit_status(MaintenanceStatus("reinitialising replica"))
928-
self._patroni.reinitialize_postgresql()
929-
logger.debug("Deferring on_peer_relation_changed: reinitialising replica")
930-
event.defer()
931-
return
932-
933916
self._start_stop_pgbackrest_service(event)
934917

935918
# This is intended to be executed only when leader is reinitializing S3 connection due to the leader change.
@@ -1933,7 +1916,8 @@ def _on_update_status(self, _) -> None:
19331916
if self.primary_endpoint:
19341917
self._update_relation_endpoints()
19351918

1936-
if self._handle_workload_failures():
1919+
if not self._patroni.member_started and self._patroni.is_member_isolated:
1920+
self._patroni.restart_patroni()
19371921
return
19381922

19391923
# Update the sync-standby endpoint in the async replication data.
@@ -2054,40 +2038,6 @@ def _handle_processes_failures(self) -> bool:
20542038

20552039
return False
20562040

2057-
def _handle_workload_failures(self) -> bool:
2058-
"""Handle workload (Patroni or PostgreSQL) failures.
2059-
2060-
Returns:
2061-
a bool indicating whether the charm performed any action.
2062-
"""
2063-
# Restart the workload if it's stuck on the starting state after a restart.
2064-
try:
2065-
is_primary = self.is_primary
2066-
is_standby_leader = self.is_standby_leader
2067-
except RetryError:
2068-
return False
2069-
2070-
if (
2071-
not self.has_raft_keys()
2072-
and not is_primary
2073-
and not is_standby_leader
2074-
and not self._patroni.member_started
2075-
and "postgresql_restarted" in self._peers.data[self.unit]
2076-
and self._patroni.member_replication_lag == "unknown"
2077-
):
2078-
logger.warning("Workload failure detected. Reinitialising unit.")
2079-
self.set_unit_status(MaintenanceStatus("reinitialising replica"))
2080-
self._patroni.reinitialize_postgresql()
2081-
return True
2082-
2083-
# Restart the service if the current cluster member is isolated from the cluster
2084-
# (stuck with the "awaiting for member to start" message).
2085-
if not self._patroni.member_started and self._patroni.is_member_isolated:
2086-
self._patroni.restart_patroni()
2087-
return True
2088-
2089-
return False
2090-
20912041
def _set_primary_status_message(self) -> None:
20922042
"""Display 'Primary' in the unit status message if the current unit is the primary."""
20932043
try:

tests/unit/test_charm.py

Lines changed: 1 addition & 24 deletions
Original file line number | Diff line number | Diff line change
@@ -803,7 +803,6 @@ def test_on_update_status(harness):
803803
) as _set_primary_status_message,
804804
patch("charm.Patroni.restart_patroni") as _restart_patroni,
805805
patch("charm.Patroni.is_member_isolated") as _is_member_isolated,
806-
patch("charm.Patroni.reinitialize_postgresql") as _reinitialize_postgresql,
807806
patch(
808807
"charm.Patroni.member_replication_lag", new_callable=PropertyMock
809808
) as _member_replication_lag,
@@ -874,24 +873,9 @@ def test_on_update_status(harness):
874873
harness.charm.on.update_status.emit()
875874
_set_primary_status_message.assert_called_once()
876875

877-
# Test the reinitialisation of the replica when its lag is unknown
878-
# after a restart.
876+
# Test call to restart when the member is isolated from the cluster.
879877
_set_primary_status_message.reset_mock()
880-
_is_primary.return_value = False
881-
_is_standby_leader.return_value = False
882878
_member_started.return_value = False
883-
_is_member_isolated.return_value = False
884-
_member_replication_lag.return_value = "unknown"
885-
with harness.hooks_disabled():
886-
harness.update_relation_data(
887-
rel_id, harness.charm.unit.name, {"postgresql_restarted": "True"}
888-
)
889-
harness.charm.on.update_status.emit()
890-
_reinitialize_postgresql.assert_called_once()
891-
_restart_patroni.assert_not_called()
892-
_set_primary_status_message.assert_not_called()
893-
894-
# Test call to restart when the member is isolated from the cluster.
895879
_is_member_isolated.return_value = True
896880
with harness.hooks_disabled():
897881
harness.update_relation_data(
@@ -908,9 +892,6 @@ def test_on_update_status_after_restore_operation(harness):
908892
patch(
909893
"charm.PostgresqlOperatorCharm._set_primary_status_message"
910894
) as _set_primary_status_message,
911-
patch(
912-
"charm.PostgresqlOperatorCharm._handle_workload_failures"
913-
) as _handle_workload_failures,
914895
patch(
915896
"charm.PostgresqlOperatorCharm._update_relation_endpoints"
916897
) as _update_relation_endpoints,
@@ -946,7 +927,6 @@ def test_on_update_status_after_restore_operation(harness):
946927
_handle_processes_failures.assert_not_called()
947928
_oversee_users.assert_not_called()
948929
_update_relation_endpoints.assert_not_called()
949-
_handle_workload_failures.assert_not_called()
950930
_set_primary_status_message.assert_not_called()
951931
assert isinstance(harness.charm.unit.status, BlockedStatus)
952932

@@ -959,7 +939,6 @@ def test_on_update_status_after_restore_operation(harness):
959939
_handle_processes_failures.assert_not_called()
960940
_oversee_users.assert_not_called()
961941
_update_relation_endpoints.assert_not_called()
962-
_handle_workload_failures.assert_not_called()
963942
_set_primary_status_message.assert_not_called()
964943
assert isinstance(harness.charm.unit.status, ActiveStatus)
965944

@@ -973,13 +952,11 @@ def test_on_update_status_after_restore_operation(harness):
973952
_member_started.return_value = True
974953
_can_use_s3_repository.return_value = (True, None)
975954
_handle_processes_failures.return_value = False
976-
_handle_workload_failures.return_value = False
977955
harness.charm.on.update_status.emit()
978956
_update_config.assert_called_once()
979957
_handle_processes_failures.assert_called_once()
980958
_oversee_users.assert_called_once()
981959
_update_relation_endpoints.assert_called_once()
982-
_handle_workload_failures.assert_called_once()
983960
_set_primary_status_message.assert_called_once()
984961
assert isinstance(harness.charm.unit.status, ActiveStatus)
985962

0 commit comments

Comments
 (0)