Skip to content

Commit 36e5fa4

Browse files
[DPE-3591] Fix shared buffers validation (#361)
* Fix shared buffers validation Signed-off-by: Marcelo Henrique Neppel <[email protected]> * Consider shared buffers config option value when calculating effective cache size value Signed-off-by: Marcelo Henrique Neppel <[email protected]> * Fix PostgreSQL restart not being called Signed-off-by: Marcelo Henrique Neppel <[email protected]> * Add and update unit test Signed-off-by: Marcelo Henrique Neppel <[email protected]> * Update unit test Signed-off-by: Marcelo Henrique Neppel <[email protected]> * Increase test timeout Signed-off-by: Marcelo Henrique Neppel <[email protected]> * Increase test timeout Signed-off-by: Marcelo Henrique Neppel <[email protected]> * Switch to GH runners * Increase stanza timeout * [MISC] Check for peer relation when getting the primary endpoint (#366) * Reenable landscape tests * Try GH runners * Back on self-hosted runner * Don't check for primary if peer didn't join yet * Don't recheck missing primary ip when looking for primary endpoint * Catch list user exception * Catch Psycopg2 error when trying to set config * Try to preflight connection on config validation * Don't retry db connection check * Set the primary endpoint before starting the primary * Revert "Set the primary endpoint before starting the primary" This reverts commit 8184dfa. * Wait for peer before starting primary * Switch to GH runners * Recheck DB connection * Don't check peer on primary startup * Try to skip primary IP validation if not all cluster IPs seem to be available * Comment * Review comments * Unit tests --------- Signed-off-by: Marcelo Henrique Neppel <[email protected]> Co-authored-by: Dragomir Penev <[email protected]> Co-authored-by: Dragomir Penev <[email protected]>
1 parent 09dd653 commit 36e5fa4

17 files changed

+166
-63
lines changed

poetry.lock

Lines changed: 0 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

src/charm.py

Lines changed: 64 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@
88
import os
99
import platform
1010
import subprocess
11-
import time
1211
from typing import Dict, List, Literal, Optional, Set, get_args
1312

1413
from charms.data_platform_libs.v0.data_interfaces import DataPeer, DataPeerUnit
@@ -20,6 +19,7 @@
2019
PostgreSQL,
2120
PostgreSQLCreateUserError,
2221
PostgreSQLEnableDisableExtensionError,
22+
PostgreSQLListUsersError,
2323
PostgreSQLUpdateUserPasswordError,
2424
)
2525
from charms.postgresql_k8s.v0.postgresql_tls import PostgreSQLTLS
@@ -44,7 +44,7 @@
4444
Unit,
4545
WaitingStatus,
4646
)
47-
from tenacity import RetryError, Retrying, retry, stop_after_delay, wait_fixed
47+
from tenacity import RetryError, Retrying, retry, stop_after_attempt, stop_after_delay, wait_fixed
4848

4949
from backups import PostgreSQLBackups
5050
from cluster import (
@@ -300,6 +300,9 @@ def postgresql(self) -> PostgreSQL:
300300
@property
301301
def primary_endpoint(self) -> Optional[str]:
302302
"""Returns the endpoint of the primary instance or None when no primary available."""
303+
if not self._peers:
304+
logger.debug("primary endpoint early exit: Peer relation not joined yet.")
305+
return None
303306
try:
304307
for attempt in Retrying(stop=stop_after_delay(60), wait=wait_fixed(3)):
305308
with attempt:
@@ -309,7 +312,20 @@ def primary_endpoint(self) -> Optional[str]:
309312
# returned is not in the list of the current cluster members
310313
# (like when the cluster was not updated yet after a failed switchover).
311314
if not primary_endpoint or primary_endpoint not in self._units_ips:
312-
raise ValueError()
315+
# TODO figure out why peer data is not available
316+
if (
317+
primary_endpoint
318+
and len(self._units_ips) == 1
319+
and len(self._peers.units) > 1
320+
):
321+
logger.warning(
322+
"Possibly incomplete peer data: Will not map primary IP to unit IP"
323+
)
324+
return primary_endpoint
325+
logger.debug(
326+
"primary endpoint early exit: Primary IP not in cached peer list."
327+
)
328+
primary_endpoint = None
313329
except RetryError:
314330
return None
315331
else:
@@ -1037,6 +1053,10 @@ def _start_primary(self, event: StartEvent) -> None:
10371053
logger.exception(e)
10381054
self.unit.status = BlockedStatus("Failed to create postgres user")
10391055
return
1056+
except PostgreSQLListUsersError:
1057+
logger.warning("Deferring on_start: Unable to list users")
1058+
event.defer()
1059+
return
10401060

10411061
self.postgresql.set_up_database()
10421062

@@ -1382,6 +1402,17 @@ def _is_workload_running(self) -> bool:
13821402

13831403
return charmed_postgresql_snap.services["patroni"]["active"]
13841404

1405+
@property
1406+
def _can_connect_to_postgresql(self) -> bool:
1407+
try:
1408+
for attempt in Retrying(stop=stop_after_delay(30), wait=wait_fixed(3)):
1409+
with attempt:
1410+
assert self.postgresql.get_postgresql_timezones()
1411+
except RetryError:
1412+
logger.debug("Cannot connect to database")
1413+
return False
1414+
return True
1415+
13851416
def update_config(self, is_creating_backup: bool = False) -> bool:
13861417
"""Updates Patroni config file based on the existence of the TLS files."""
13871418
if (
@@ -1425,6 +1456,10 @@ def update_config(self, is_creating_backup: bool = False) -> bool:
14251456
logger.debug("Early exit update_config: Patroni not started yet")
14261457
return False
14271458

1459+
# Try to connect
1460+
if not self._can_connect_to_postgresql:
1461+
logger.warning("Early exit update_config: Cannot connect to Postgresql")
1462+
return False
14281463
self._validate_config_options()
14291464

14301465
self._patroni.bulk_update_parameters_controller_by_patroni(
@@ -1434,20 +1469,7 @@ def update_config(self, is_creating_backup: bool = False) -> bool:
14341469
}
14351470
)
14361471

1437-
restart_postgresql = self.is_tls_enabled != self.postgresql.is_tls_enabled()
1438-
self._patroni.reload_patroni_configuration()
1439-
# Sleep the same time as Patroni's loop_wait default value, which tells how much time
1440-
# Patroni will wait before checking the configuration file again to reload it.
1441-
time.sleep(10)
1442-
restart_postgresql = restart_postgresql or self.postgresql.is_restart_pending()
1443-
self.unit_peer_data.update({"tls": "enabled" if enable_tls else ""})
1444-
1445-
# Restart PostgreSQL if TLS configuration has changed
1446-
# (so both the old and new connections use the configuration).
1447-
if restart_postgresql:
1448-
logger.info("PostgreSQL restart required")
1449-
self._peers.data[self.unit].pop("postgresql_restarted", None)
1450-
self.on[self.restart_manager.name].acquire_lock.emit()
1472+
self._handle_postgresql_restart_need(enable_tls)
14511473

14521474
# Restart the monitoring service if the password was rotated
14531475
cache = snap.SnapCache()
@@ -1487,6 +1509,31 @@ def _validate_config_options(self) -> None:
14871509
):
14881510
raise Exception("request_time_zone config option has an invalid value")
14891511

1512+
def _handle_postgresql_restart_need(self, enable_tls: bool) -> None:
1513+
"""Handle PostgreSQL restart need based on the TLS configuration and configuration changes."""
1514+
restart_postgresql = self.is_tls_enabled != self.postgresql.is_tls_enabled()
1515+
self._patroni.reload_patroni_configuration()
1516+
# Wait for some more time than the Patroni's loop_wait default value (10 seconds),
1517+
# which tells how much time Patroni will wait before checking the configuration
1518+
# file again to reload it.
1519+
try:
1520+
for attempt in Retrying(stop=stop_after_attempt(5), wait=wait_fixed(3)):
1521+
with attempt:
1522+
restart_postgresql = restart_postgresql or self.postgresql.is_restart_pending()
1523+
if not restart_postgresql:
1524+
raise Exception
1525+
except RetryError:
1526+
# Ignore the error, as it happens only to indicate that the configuration has not changed.
1527+
pass
1528+
self.unit_peer_data.update({"tls": "enabled" if enable_tls else ""})
1529+
1530+
# Restart PostgreSQL if TLS configuration has changed
1531+
# (so both the old and new connections use the configuration).
1532+
if restart_postgresql:
1533+
logger.info("PostgreSQL restart required")
1534+
self._peers.data[self.unit].pop("postgresql_restarted", None)
1535+
self.on[self.restart_manager.name].acquire_lock.emit()
1536+
14901537
def _update_relation_endpoints(self) -> None:
14911538
"""Updates endpoints and read-only endpoint in all relations."""
14921539
self.postgresql_client_relation.update_endpoints()

tests/integration/ha_tests/helpers.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -819,7 +819,7 @@ async def add_unit_with_storage(ops_test, app, storage):
819819
return_code, _, _ = await ops_test.juju(*add_unit_cmd)
820820
assert return_code == 0, "Failed to add unit with storage"
821821
async with ops_test.fast_forward():
822-
await ops_test.model.wait_for_idle(apps=[app], status="active", timeout=1500)
822+
await ops_test.model.wait_for_idle(apps=[app], status="active", timeout=2000)
823823
assert (
824824
len(ops_test.model.applications[app].units) == expected_units
825825
), "New unit not added to model"

tests/integration/ha_tests/test_replication.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@
1818
)
1919

2020

21-
@pytest.mark.runner(["self-hosted", "linux", "X64", "jammy", "large"])
2221
@pytest.mark.group(1)
2322
@pytest.mark.abort_on_fail
2423
async def test_build_and_deploy(ops_test: OpsTest) -> None:

tests/integration/ha_tests/test_restore_cluster.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,6 @@
2929
charm = None
3030

3131

32-
@pytest.mark.runner(["self-hosted", "linux", "X64", "jammy", "large"])
3332
@pytest.mark.group(1)
3433
@pytest.mark.abort_on_fail
3534
async def test_build_and_deploy(ops_test: OpsTest) -> None:

tests/integration/ha_tests/test_self_healing.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,6 @@
6060
DB_PROCESSES = [POSTGRESQL_PROCESS, PATRONI_PROCESS]
6161

6262

63-
@pytest.mark.runner(["self-hosted", "linux", "X64", "jammy", "large"])
6463
@pytest.mark.group(1)
6564
@pytest.mark.abort_on_fail
6665
async def test_build_and_deploy(ops_test: OpsTest) -> None:

tests/integration/ha_tests/test_upgrade.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,6 @@
3030
TIMEOUT = 600
3131

3232

33-
@pytest.mark.runner(["self-hosted", "linux", "X64", "jammy", "large"])
3433
@pytest.mark.group(1)
3534
@pytest.mark.abort_on_fail
3635
async def test_deploy_latest(ops_test: OpsTest) -> None:

tests/integration/ha_tests/test_upgrade_from_stable.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,6 @@
2525
TIMEOUT = 600
2626

2727

28-
@pytest.mark.runner(["self-hosted", "linux", "X64", "jammy", "large"])
2928
@pytest.mark.group(1)
3029
@pytest.mark.abort_on_fail
3130
async def test_deploy_stable(ops_test: OpsTest) -> None:

tests/integration/new_relations/test_new_relations.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,6 @@
3636
INVALID_EXTRA_USER_ROLE_BLOCKING_MESSAGE = "invalid role(s) for extra user roles"
3737

3838

39-
@pytest.mark.runner(["self-hosted", "linux", "X64", "jammy", "large"])
4039
@pytest.mark.group(1)
4140
@pytest.mark.abort_on_fail
4241
async def test_deploy_charms(ops_test: OpsTest, charm):

tests/integration/test_backups.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,6 @@ async def cloud_configs(ops_test: OpsTest, github_secrets) -> None:
9090
bucket_object.delete()
9191

9292

93-
@pytest.mark.runner(["self-hosted", "linux", "X64", "jammy", "large"])
9493
@pytest.mark.group(1)
9594
async def test_none() -> None:
9695
"""Empty test so that the suite will not fail if all tests are skipped."""
@@ -133,7 +132,7 @@ async def test_backup(ops_test: OpsTest, cloud_configs: Tuple[Dict, Dict]) -> No
133132
await action.wait()
134133
async with ops_test.fast_forward(fast_interval="60s"):
135134
await ops_test.model.wait_for_idle(
136-
apps=[database_app_name, S3_INTEGRATOR_APP_NAME], status="active", timeout=1000
135+
apps=[database_app_name, S3_INTEGRATOR_APP_NAME], status="active", timeout=1200
137136
)
138137

139138
primary = await get_primary(ops_test, f"{database_app_name}/0")

0 commit comments

Comments
 (0)