Skip to content

Commit c996dd3

Browse files
author
Lucas Gameiro
authored
[DPE-4427] Address main instability sources on backups integration tests (#496)
* add postgres connection check * add leader check in peer event + remove restart * remove unit test * refactor initialization check * catch RetryError and defer tls event * add primary switchover on scale-down * revert patroni restart change * add sleep to tls check and catch modelError * wait before first backup + better logging on TLS test * add wait for model in TLS test + idle timeout * catch retry error from patroni call + increase retries * revert retry attempt limit change * revert wait model on tls test * update test_charm parameter * refactor on initialize cluster check + make self_healing fail fast * add shared buffers to dynamic config * revert + make wait for TLS in test_backups * clean previous changes * try waiting for idle after relating * reposition wait to avoid charm stuck * bump LIBPATCH and set log to warning level
1 parent 04c217e commit c996dd3

File tree

5 files changed

+113
-13
lines changed

5 files changed

+113
-13
lines changed

lib/charms/postgresql_k8s/v0/postgresql_tls.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434
from ops.charm import ActionEvent, RelationBrokenEvent
3535
from ops.framework import Object
3636
from ops.pebble import ConnectionError, PathError, ProtocolError
37+
from tenacity import RetryError
3738

3839
# The unique Charmhub library identifier, never change it
3940
LIBID = "c27af44a92df4ef38d7ae06418b2800f"
@@ -43,7 +44,7 @@
4344

4445
# Increment this PATCH version before using `charmcraft publish-lib` or reset
4546
# to 0 if you are raising the major API version.
46-
LIBPATCH = 8
47+
LIBPATCH = 9
4748

4849
logger = logging.getLogger(__name__)
4950
SCOPE = "unit"
@@ -142,7 +143,7 @@ def _on_certificate_available(self, event: CertificateAvailableEvent) -> None:
142143
logger.debug("Cannot push TLS certificates at this moment")
143144
event.defer()
144145
return
145-
except (ConnectionError, PathError, ProtocolError) as e:
146+
except (ConnectionError, PathError, ProtocolError, RetryError) as e:
146147
logger.error("Cannot push TLS certificates: %r", e)
147148
event.defer()
148149
return

src/charm.py

Lines changed: 88 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -45,14 +45,15 @@
4545
Container,
4646
JujuVersion,
4747
MaintenanceStatus,
48+
ModelError,
4849
Relation,
4950
Unit,
5051
UnknownStatus,
5152
WaitingStatus,
5253
)
5354
from ops.pebble import ChangeError, Layer, PathError, ProtocolError, ServiceStatus
5455
from requests import ConnectionError
55-
from tenacity import RetryError, Retrying, stop_after_attempt, wait_fixed
56+
from tenacity import RetryError, Retrying, stop_after_attempt, stop_after_delay, wait_fixed
5657

5758
from backups import PostgreSQLBackups
5859
from config import CharmConfig
@@ -80,7 +81,7 @@
8081
WORKLOAD_OS_GROUP,
8182
WORKLOAD_OS_USER,
8283
)
83-
from patroni import NotReadyError, Patroni
84+
from patroni import NotReadyError, Patroni, SwitchoverFailedError
8485
from relations.async_replication import PostgreSQLAsyncReplication
8586
from relations.db import EXTENSIONS_BLOCKING_MESSAGE, DbProvides
8687
from relations.postgresql_provider import PostgreSQLProvider
@@ -144,6 +145,7 @@ def __init__(self, *args):
144145
self.framework.observe(self.on.secret_changed, self._on_peer_relation_changed)
145146
self.framework.observe(self.on[PEER].relation_departed, self._on_peer_relation_departed)
146147
self.framework.observe(self.on.postgresql_pebble_ready, self._on_postgresql_pebble_ready)
148+
self.framework.observe(self.on.pgdata_storage_detaching, self._on_pgdata_storage_detaching)
147149
self.framework.observe(self.on.stop, self._on_stop)
148150
self.framework.observe(self.on.upgrade_charm, self._on_upgrade_charm)
149151
self.framework.observe(self.on.get_password_action, self._on_get_password)
@@ -379,14 +381,68 @@ def _on_peer_relation_departed(self, event: RelationDepartedEvent) -> None:
379381
# Update the sync-standby endpoint in the async replication data.
380382
self.async_replication.update_async_replication_data()
381383

382-
def _on_peer_relation_changed(self, event: HookEvent) -> None:
384+
def _on_pgdata_storage_detaching(self, _) -> None:
385+
# Change the primary if it's the unit that is being removed.
386+
try:
387+
primary = self._patroni.get_primary(unit_name_pattern=True)
388+
except RetryError:
389+
# Ignore the event if the primary couldn't be retrieved.
390+
# If a switchover is needed, an automatic failover will be triggered
391+
# when the unit is removed.
392+
logger.debug("Early exit on_pgdata_storage_detaching: primary cannot be retrieved")
393+
return
394+
395+
if self.unit.name != primary:
396+
return
397+
398+
if not self._patroni.are_all_members_ready():
399+
logger.warning(
400+
"could not switchover because not all members are ready"
401+
" - an automatic failover will be triggered"
402+
)
403+
return
404+
405+
# Try to switchover to another member and raise an exception if it doesn't succeed.
406+
# If it doesn't happen on time, Patroni will automatically run a fail-over.
407+
try:
408+
# Get the current primary to check if it has changed later.
409+
current_primary = self._patroni.get_primary()
410+
411+
# Trigger the switchover.
412+
self._patroni.switchover()
413+
414+
# Wait for the switchover to complete.
415+
self._patroni.primary_changed(current_primary)
416+
417+
logger.info("successful switchover")
418+
except (RetryError, SwitchoverFailedError) as e:
419+
logger.warning(
420+
f"switchover failed with reason: {e} - an automatic failover will be triggered"
421+
)
422+
return
423+
424+
# Only update the connection endpoints if there is a primary.
425+
# A cluster can have all members as replicas for some time after
426+
# a failed switchover, so wait until the primary is elected.
427+
endpoints_to_remove = self._get_endpoints_to_remove()
428+
self.postgresql_client_relation.update_read_only_endpoint()
429+
self._remove_from_endpoints(endpoints_to_remove)
430+
431+
def _on_peer_relation_changed(self, event: HookEvent) -> None: # noqa: C901
383432
"""Reconfigure cluster members."""
384433
# The cluster must be initialized first in the leader unit
385434
# before any other member joins the cluster.
386435
if "cluster_initialised" not in self._peers.data[self.app]:
387-
logger.debug(
388-
"Deferring on_peer_relation_changed: Cluster must be initialized before members can join"
389-
)
436+
if self.unit.is_leader():
437+
if self._initialize_cluster(event):
438+
logger.debug("Deferring on_peer_relation_changed: Leader initialized cluster")
439+
else:
440+
logger.debug("_initialized_cluster failed on _peer_relation_changed")
441+
return
442+
else:
443+
logger.debug(
444+
"Deferring on_peer_relation_changed: Cluster must be initialized before members can join"
445+
)
390446
event.defer()
391447
return
392448

@@ -437,7 +493,10 @@ def _on_peer_relation_changed(self, event: HookEvent) -> None:
437493
event.defer()
438494
return
439495

440-
self.postgresql_client_relation.update_read_only_endpoint()
496+
try:
497+
self.postgresql_client_relation.update_read_only_endpoint()
498+
except ModelError as e:
499+
logger.warning("Cannot update read_only endpoints: %s", str(e))
441500

442501
self.backup.coordinate_stanza_fields()
443502

@@ -594,6 +653,9 @@ def _add_members(self, event) -> None:
594653
except NotReadyError:
595654
logger.info("Deferring reconfigure: another member doing sync right now")
596655
event.defer()
656+
except RetryError:
657+
logger.info("Deferring reconfigure: failed to obtain cluster members from Patroni")
658+
event.defer()
597659

598660
def add_cluster_member(self, member: str) -> None:
599661
"""Add member to the cluster if all members are already up and running.
@@ -1432,6 +1494,14 @@ def _restart(self, event: RunWithLock) -> None:
14321494
# Update health check URL.
14331495
self._update_pebble_layers()
14341496

1497+
try:
1498+
for attempt in Retrying(wait=wait_fixed(3), stop=stop_after_delay(300)):
1499+
with attempt:
1500+
if not self._can_connect_to_postgresql:
1501+
assert False
1502+
except Exception:
1503+
logger.exception("Unable to reconnect to postgresql")
1504+
14351505
# Start or stop the pgBackRest TLS server service when TLS certificate change.
14361506
self.backup.start_stop_pgbackrest_service()
14371507

@@ -1448,6 +1518,17 @@ def _is_workload_running(self) -> bool:
14481518

14491519
return services[0].current == ServiceStatus.ACTIVE
14501520

1521+
@property
1522+
def _can_connect_to_postgresql(self) -> bool:
1523+
try:
1524+
for attempt in Retrying(stop=stop_after_delay(30), wait=wait_fixed(3)):
1525+
with attempt:
1526+
assert self.postgresql.get_postgresql_timezones()
1527+
except RetryError:
1528+
logger.debug("Cannot connect to database")
1529+
return False
1530+
return True
1531+
14511532
def update_config(self, is_creating_backup: bool = False) -> bool:
14521533
"""Updates Patroni config file based on the existence of the TLS files."""
14531534
# Retrieve PostgreSQL parameters.

src/patroni.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
RetryError,
1818
Retrying,
1919
retry,
20+
retry_if_result,
2021
stop_after_attempt,
2122
stop_after_delay,
2223
wait_exponential,
@@ -490,3 +491,13 @@ def switchover(self, candidate: str = None) -> None:
490491
new_primary = self.get_primary()
491492
if (candidate is not None and new_primary != candidate) or new_primary == primary:
492493
raise SwitchoverFailedError("primary was not switched correctly")
494+
495+
@retry(
496+
retry=retry_if_result(lambda x: not x),
497+
stop=stop_after_attempt(10),
498+
wait=wait_exponential(multiplier=1, min=2, max=30),
499+
)
500+
def primary_changed(self, old_primary: str) -> bool:
501+
"""Checks whether the primary unit has changed."""
502+
primary = self.get_primary()
503+
return primary != old_primary

tests/integration/test_backups.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -109,8 +109,13 @@ async def test_backup_and_restore(ops_test: OpsTest, cloud_configs: Tuple[Dict,
109109
await build_and_deploy(
110110
ops_test, 2, database_app_name=database_app_name, wait_for_idle=False
111111
)
112-
await ops_test.model.relate(database_app_name, S3_INTEGRATOR_APP_NAME)
112+
113113
await ops_test.model.relate(database_app_name, TLS_CERTIFICATES_APP_NAME)
114+
async with ops_test.fast_forward(fast_interval="60s"):
115+
await ops_test.model.wait_for_idle(
116+
apps=[database_app_name], status="active", timeout=1000
117+
)
118+
await ops_test.model.relate(database_app_name, S3_INTEGRATOR_APP_NAME)
114119

115120
# Configure and set access and secret keys.
116121
logger.info(f"configuring S3 integrator for {cloud}")
@@ -142,7 +147,9 @@ async def test_backup_and_restore(ops_test: OpsTest, cloud_configs: Tuple[Dict,
142147
)
143148
connection.close()
144149

145-
# Run the "create backup" action.
150+
# With a stable cluster, Run the "create backup" action
151+
async with ops_test.fast_forward():
152+
await ops_test.model.wait_for_idle(status="active", timeout=1000, idle_period=30)
146153
logger.info("creating a backup")
147154
action = await ops_test.model.units.get(replica).run_action("create-backup")
148155
await action.wait()

tests/integration/test_charm.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -174,7 +174,7 @@ async def test_postgresql_parameters_change(ops_test: OpsTest) -> None:
174174
"""Test that's possible to change PostgreSQL parameters."""
175175
await ops_test.model.applications[APP_NAME].set_config({
176176
"memory_max_prepared_transactions": "100",
177-
"memory_shared_buffers": "128",
177+
"memory_shared_buffers": "32768", # 2 * 128MB. Patroni may refuse the config if < 128MB
178178
"response_lc_monetary": "en_GB.utf8",
179179
"experimental_max_connections": "200",
180180
})
@@ -206,7 +206,7 @@ async def test_postgresql_parameters_change(ops_test: OpsTest) -> None:
206206

207207
# Validate each configuration set by Patroni on PostgreSQL.
208208
assert settings["max_prepared_transactions"] == "100"
209-
assert settings["shared_buffers"] == "128"
209+
assert settings["shared_buffers"] == "32768"
210210
assert settings["lc_monetary"] == "en_GB.utf8"
211211
assert settings["max_connections"] == "200"
212212
finally:

0 commit comments

Comments
 (0)