Skip to content

Commit 64b65b4

Browse files
[DPE-3257] Fix network cut tests (#346)
* Fix network cut test Signed-off-by: Marcelo Henrique Neppel <[email protected]> * Fix network cut test without IP change Signed-off-by: Marcelo Henrique Neppel <[email protected]> * Update unit test Signed-off-by: Marcelo Henrique Neppel <[email protected]> * Fix retrieval of units IPs Signed-off-by: Marcelo Henrique Neppel <[email protected]> * Improve checks for readiness Signed-off-by: Marcelo Henrique Neppel <[email protected]> * Fix IP retrieval Signed-off-by: Marcelo Henrique Neppel <[email protected]> * Fix IP retrieval Signed-off-by: Marcelo Henrique Neppel <[email protected]> * Fix IP retrieval Signed-off-by: Marcelo Henrique Neppel <[email protected]> --------- Signed-off-by: Marcelo Henrique Neppel <[email protected]>
1 parent a87c52b commit 64b65b4

File tree

4 files changed

+140
-48
lines changed

4 files changed

+140
-48
lines changed

src/charm.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -530,12 +530,14 @@ def _reconfigure_cluster(self, event: HookEvent):
530530
and event.relation.data[event.unit].get("ip-to-remove") is not None
531531
):
532532
ip_to_remove = event.relation.data[event.unit].get("ip-to-remove")
533+
logger.info("Removing %s from the cluster due to IP change", ip_to_remove)
533534
try:
534535
self._patroni.remove_raft_member(ip_to_remove)
535536
except RemoveRaftMemberFailedError:
536537
logger.debug("Deferring on_peer_relation_changed: failed to remove raft member")
537538
return False
538-
self._remove_from_members_ips(ip_to_remove)
539+
if ip_to_remove in self.members_ips:
540+
self._remove_from_members_ips(ip_to_remove)
539541
self._add_members(event)
540542
return True
541543

@@ -818,6 +820,7 @@ def _on_leader_elected(self, event: LeaderElectedEvent) -> None:
818820

819821
# Remove departing units when the leader changes.
820822
for ip in self._get_ips_to_remove():
823+
logger.info("Removing %s from the cluster", ip)
821824
self._remove_from_members_ips(ip)
822825

823826
self.update_config()

tests/integration/ha_tests/helpers.py

Lines changed: 101 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
# Copyright 2022 Canonical Ltd.
22
# See LICENSE file for licensing details.
3+
import logging
34
import os
45
import random
56
import subprocess
@@ -22,6 +23,8 @@
2223

2324
from ..helpers import APPLICATION_NAME, db_connect, get_unit_address, run_command_on_unit
2425

26+
logger = logging.getLogger(__name__)
27+
2528
METADATA = yaml.safe_load(Path("./metadata.yaml").read_text())
2629
PORT = 5432
2730
APP_NAME = METADATA["name"]
@@ -74,13 +77,19 @@ async def are_all_db_processes_down(ops_test: OpsTest, process: str) -> bool:
7477
return True
7578

7679

77-
async def are_writes_increasing(ops_test, down_unit: str = None) -> None:
80+
async def are_writes_increasing(
81+
ops_test, down_unit: str = None, use_ip_from_inside: bool = False
82+
) -> None:
7883
"""Verify new writes are continuing by counting the number of writes."""
79-
writes, _ = await count_writes(ops_test, down_unit=down_unit)
84+
writes, _ = await count_writes(
85+
ops_test, down_unit=down_unit, use_ip_from_inside=use_ip_from_inside
86+
)
8087
for member, count in writes.items():
8188
for attempt in Retrying(stop=stop_after_delay(60 * 3), wait=wait_fixed(3)):
8289
with attempt:
83-
more_writes, _ = await count_writes(ops_test, down_unit=down_unit)
90+
more_writes, _ = await count_writes(
91+
ops_test, down_unit=down_unit, use_ip_from_inside=use_ip_from_inside
92+
)
8493
assert more_writes[member] > count, f"{member}: writes not continuing to DB"
8594

8695

@@ -161,33 +170,46 @@ async def change_wal_settings(
161170
)
162171

163172

164-
async def is_cluster_updated(ops_test: OpsTest, primary_name: str) -> None:
173+
async def is_cluster_updated(
174+
ops_test: OpsTest, primary_name: str, use_ip_from_inside: bool = False
175+
) -> None:
165176
# Verify that the old primary is now a replica.
177+
logger.info("checking that the former primary is now a replica")
166178
assert await is_replica(
167-
ops_test, primary_name
179+
ops_test, primary_name, use_ip_from_inside
168180
), "there are more than one primary in the cluster."
169181

170182
# Verify that all units are part of the same cluster.
171-
member_ips = await fetch_cluster_members(ops_test)
183+
logger.info("checking that all units are part of the same cluster")
184+
member_ips = await fetch_cluster_members(ops_test, use_ip_from_inside)
172185
app = primary_name.split("/")[0]
173186
ip_addresses = [
174-
await get_unit_ip(ops_test, unit.name) for unit in ops_test.model.applications[app].units
187+
await (
188+
get_ip_from_inside_the_unit(ops_test, unit.name)
189+
if use_ip_from_inside
190+
else get_unit_ip(ops_test, unit.name)
191+
)
192+
for unit in ops_test.model.applications[app].units
175193
]
176194
assert set(member_ips) == set(ip_addresses), "not all units are part of the same cluster."
177195

178196
# Verify that no writes to the database were missed after stopping the writes.
179-
total_expected_writes = await check_writes(ops_test)
197+
logger.info("checking that no writes to the database were missed after stopping the writes")
198+
total_expected_writes = await check_writes(ops_test, use_ip_from_inside)
180199

181200
# Verify that old primary is up-to-date.
201+
logger.info("checking that the former primary is up to date with the cluster after restarting")
182202
assert await is_secondary_up_to_date(
183-
ops_test, primary_name, total_expected_writes
203+
ops_test, primary_name, total_expected_writes, use_ip_from_inside
184204
), "secondary not up to date with the cluster after restarting."
185205

186206

187-
async def check_writes(ops_test) -> int:
207+
async def check_writes(ops_test, use_ip_from_inside: bool = False) -> int:
188208
"""Gets the total writes from the test charm and compares to the writes from db."""
189209
total_expected_writes = await stop_continuous_writes(ops_test)
190-
actual_writes, max_number_written = await count_writes(ops_test)
210+
actual_writes, max_number_written = await count_writes(
211+
ops_test, use_ip_from_inside=use_ip_from_inside
212+
)
191213
for member, count in actual_writes.items():
192214
assert (
193215
count == max_number_written[member]
@@ -197,14 +219,20 @@ async def check_writes(ops_test) -> int:
197219

198220

199221
async def count_writes(
200-
ops_test: OpsTest, down_unit: str = None
222+
ops_test: OpsTest, down_unit: str = None, use_ip_from_inside: bool = False
201223
) -> Tuple[Dict[str, int], Dict[str, int]]:
202224
"""Count the number of writes in the database."""
203225
app = await app_name(ops_test)
204226
password = await get_password(ops_test, app, down_unit)
205227
for unit in ops_test.model.applications[app].units:
206228
if unit.name != down_unit:
207-
cluster = get_patroni_cluster(await get_unit_ip(ops_test, unit.name))
229+
cluster = get_patroni_cluster(
230+
await (
231+
get_ip_from_inside_the_unit(ops_test, unit.name)
232+
if use_ip_from_inside
233+
else get_unit_ip(ops_test, unit.name)
234+
)
235+
)
208236
break
209237
down_ips = []
210238
if down_unit:
@@ -263,16 +291,21 @@ def cut_network_from_unit_without_ip_change(machine_name: str) -> None:
263291
subprocess.check_call(limit_set_command.split())
264292

265293

266-
async def fetch_cluster_members(ops_test: OpsTest):
294+
async def fetch_cluster_members(ops_test: OpsTest, use_ip_from_inside: bool = False):
267295
"""Fetches the IPs listed by Patroni as cluster members.
268296
269297
Args:
270298
ops_test: OpsTest instance.
299+
use_ip_from_inside: whether to use the IP from inside the unit.
271300
"""
272301
app = await app_name(ops_test)
273302
member_ips = {}
274303
for unit in ops_test.model.applications[app].units:
275-
unit_ip = await get_unit_ip(ops_test, unit.name)
304+
unit_ip = await (
305+
get_ip_from_inside_the_unit(ops_test, unit.name)
306+
if use_ip_from_inside
307+
else get_unit_ip(ops_test, unit.name)
308+
)
276309
cluster_info = requests.get(f"http://{unit_ip}:8008/cluster")
277310
if len(member_ips) > 0:
278311
# If the list of members IPs was already fetched, also compare the
@@ -304,6 +337,16 @@ async def get_controller_machine(ops_test: OpsTest) -> str:
304337
][0]
305338

306339

340+
async def get_ip_from_inside_the_unit(ops_test: OpsTest, unit_name: str) -> str:
    """Return the unit's first IP address as reported from inside the unit.

    Runs ``hostname -I`` in the unit through ``juju exec`` instead of reading the
    address from the Juju status.

    Args:
        ops_test: OpsTest instance.
        unit_name: name of the unit to query.

    Returns:
        The first address printed by ``hostname -I``.

    Raises:
        ProcessError: if the command exits with a non-zero code or prints no address.
    """
    command = f"exec --unit {unit_name} -- hostname -I"
    return_code, stdout, stderr = await ops_test.juju(*command.split())
    if return_code != 0:
        # Build the message eagerly: exceptions do not interpolate %-style arguments.
        raise ProcessError(
            f"Expected command {command} to succeed instead it failed: {return_code} {stderr}"
        )
    # `hostname -I` prints every address on one space-separated line, so split on
    # whitespace instead of only stripping the first line.
    addresses = stdout.split()
    if not addresses:
        raise ProcessError(f"Command {command} returned no IP address")
    return addresses[0]
348+
349+
307350
async def get_patroni_setting(ops_test: OpsTest, setting: str) -> Optional[int]:
308351
"""Get the value of one of the integer Patroni settings.
309352
@@ -388,20 +431,28 @@ async def get_unit_ip(ops_test: OpsTest, unit_name: str) -> str:
388431

389432

390433
@retry(stop=stop_after_attempt(8), wait=wait_fixed(15), reraise=True)
391-
async def is_connection_possible(ops_test: OpsTest, unit_name: str) -> bool:
434+
async def is_connection_possible(
435+
ops_test: OpsTest, unit_name: str, use_ip_from_inside: bool = False
436+
) -> bool:
392437
"""Test a connection to a PostgreSQL server."""
393438
app = unit_name.split("/")[0]
394439
password = await get_password(ops_test, app, unit_name)
395-
address = await get_unit_ip(ops_test, unit_name)
440+
address = await (
441+
get_ip_from_inside_the_unit(ops_test, unit_name)
442+
if use_ip_from_inside
443+
else get_unit_ip(ops_test, unit_name)
444+
)
396445
try:
397-
with db_connect(
398-
host=address, password=password
399-
) as connection, connection.cursor() as cursor:
400-
cursor.execute("SELECT 1;")
401-
success = cursor.fetchone()[0] == 1
402-
connection.close()
403-
return success
404-
except psycopg2.Error:
446+
for attempt in Retrying(stop=stop_after_delay(60), wait=wait_fixed(3)):
447+
with attempt:
448+
with db_connect(
449+
host=address, password=password
450+
) as connection, connection.cursor() as cursor:
451+
cursor.execute("SELECT 1;")
452+
success = cursor.fetchone()[0] == 1
453+
connection.close()
454+
return success
455+
except (psycopg2.Error, RetryError):
405456
# Error raised when the connection is not possible.
406457
return False
407458

@@ -420,9 +471,13 @@ def is_machine_reachable_from(origin_machine: str, target_machine: str) -> bool:
420471
return False
421472

422473

423-
async def is_replica(ops_test: OpsTest, unit_name: str) -> bool:
474+
async def is_replica(ops_test: OpsTest, unit_name: str, use_ip_from_inside: bool = False) -> bool:
424475
"""Returns whether the unit is a replica in the cluster."""
425-
unit_ip = await get_unit_ip(ops_test, unit_name)
476+
unit_ip = await (
477+
get_ip_from_inside_the_unit(ops_test, unit_name)
478+
if use_ip_from_inside
479+
else get_unit_ip(ops_test, unit_name)
480+
)
426481
member_name = unit_name.replace("/", "-")
427482

428483
try:
@@ -532,9 +587,13 @@ async def send_signal_to_process(
532587
)
533588

534589

535-
async def is_postgresql_ready(ops_test, unit_name: str) -> bool:
590+
async def is_postgresql_ready(ops_test, unit_name: str, use_ip_from_inside: bool = False) -> bool:
536591
"""Verifies a PostgreSQL instance is running and available."""
537-
unit_ip = get_unit_address(ops_test, unit_name)
592+
unit_ip = (
593+
(await get_ip_from_inside_the_unit(ops_test, unit_name))
594+
if use_ip_from_inside
595+
else get_unit_address(ops_test, unit_name)
596+
)
538597
try:
539598
for attempt in Retrying(stop=stop_after_delay(60 * 5), wait=wait_fixed(3)):
540599
with attempt:
@@ -571,15 +630,21 @@ def restore_network_for_unit_without_ip_change(machine_name: str) -> None:
571630
subprocess.check_call(limit_set_command.split())
572631

573632

574-
async def is_secondary_up_to_date(ops_test: OpsTest, unit_name: str, expected_writes: int) -> bool:
633+
async def is_secondary_up_to_date(
634+
ops_test: OpsTest, unit_name: str, expected_writes: int, use_ip_from_inside: bool = False
635+
) -> bool:
575636
"""Checks if secondary is up-to-date with the cluster.
576637
577638
Retries over the period of one minute to give secondary adequate time to copy over data.
578639
"""
579640
app = await app_name(ops_test)
580641
password = await get_password(ops_test, app)
581642
host = [
582-
await get_unit_ip(ops_test, unit.name)
643+
await (
644+
get_ip_from_inside_the_unit(ops_test, unit.name)
645+
if use_ip_from_inside
646+
else get_unit_ip(ops_test, unit.name)
647+
)
583648
for unit in ops_test.model.applications[app].units
584649
if unit.name == unit_name
585650
][0]
@@ -679,15 +744,17 @@ async def update_restart_condition(ops_test: OpsTest, unit, condition: str):
679744

680745

681746
@retry(stop=stop_after_attempt(20), wait=wait_fixed(30))
682-
async def wait_network_restore(ops_test: OpsTest, hostname: str, old_ip: str) -> None:
747+
async def wait_network_restore(ops_test: OpsTest, unit_name: str, old_ip: str) -> None:
683748
"""Wait until network is restored.
684749
685750
Args:
686751
ops_test: pytest plugin helper
687-
hostname: The name of the instance
752+
unit_name: name of the unit
688753
old_ip: old registered IP address
689754
"""
690-
if await instance_ip(ops_test, hostname) == old_ip:
755+
# Retrieve the unit IP from inside the unit because it may not be updated in the
756+
# Juju status too quickly.
757+
if (await get_ip_from_inside_the_unit(ops_test, unit_name)) == old_ip:
691758
raise Exception
692759

693760

tests/integration/ha_tests/test_self_healing.py

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -383,7 +383,6 @@ async def test_forceful_restart_without_data_and_transaction_logs(
383383

384384

385385
@pytest.mark.group(1)
386-
@pytest.mark.unstable
387386
async def test_network_cut(ops_test: OpsTest, continuous_writes, primary_start_timeout):
388387
"""Completely cut and restore network."""
389388
# Locate primary unit.
@@ -456,19 +455,22 @@ async def test_network_cut(ops_test: OpsTest, continuous_writes, primary_start_t
456455

457456
# Wait the LXD unit has its IP updated.
458457
logger.info("waiting for IP address to be updated on Juju unit")
459-
await wait_network_restore(ops_test, primary_hostname, primary_ip)
458+
await wait_network_restore(ops_test, primary_name, primary_ip)
459+
460+
# Verify that the database service got restarted and is ready in the old primary.
461+
logger.info(f"waiting for the database service to be ready on {primary_name}")
462+
assert await is_postgresql_ready(ops_test, primary_name, use_ip_from_inside=True)
460463

461464
# Verify that connection is possible.
462465
logger.info("checking whether the connectivity to the database is working")
463466
assert await is_connection_possible(
464-
ops_test, primary_name
467+
ops_test, primary_name, use_ip_from_inside=True
465468
), "Connection is not possible after network restore"
466469

467-
await is_cluster_updated(ops_test, primary_name)
470+
await is_cluster_updated(ops_test, primary_name, use_ip_from_inside=True)
468471

469472

470473
@pytest.mark.group(1)
471-
@pytest.mark.unstable
472474
async def test_network_cut_without_ip_change(
473475
ops_test: OpsTest, continuous_writes, primary_start_timeout
474476
):
@@ -516,7 +518,7 @@ async def test_network_cut_without_ip_change(
516518

517519
async with ops_test.fast_forward():
518520
logger.info("checking whether writes are increasing")
519-
await are_writes_increasing(ops_test, primary_name)
521+
await are_writes_increasing(ops_test, primary_name, use_ip_from_inside=True)
520522

521523
logger.info("checking whether a new primary was elected")
522524
# Verify that a new primary gets elected (ie old primary is secondary).
@@ -533,10 +535,14 @@ async def test_network_cut_without_ip_change(
533535
async with ops_test.fast_forward():
534536
await ops_test.model.wait_for_idle(apps=[app], status="active")
535537

538+
# Verify that the database service got restarted and is ready in the old primary.
539+
logger.info(f"waiting for the database service to be ready on {primary_name}")
540+
assert await is_postgresql_ready(ops_test, primary_name)
541+
536542
# Verify that connection is possible.
537543
logger.info("checking whether the connectivity to the database is working")
538544
assert await is_connection_possible(
539545
ops_test, primary_name
540546
), "Connection is not possible after network restore"
541547

542-
await is_cluster_updated(ops_test, primary_name)
548+
await is_cluster_updated(ops_test, primary_name, use_ip_from_inside=True)

0 commit comments

Comments
 (0)