Skip to content

Commit 2753365

Browse files
[MISC] Split HA tests (#880)
* Split HA tests Signed-off-by: Marcelo Henrique Neppel <[email protected]> * Fix spread configurations Signed-off-by: Marcelo Henrique Neppel <[email protected]> --------- Signed-off-by: Marcelo Henrique Neppel <[email protected]>
1 parent 1425f6d commit 2753365

File tree

6 files changed

+377
-225
lines changed

6 files changed

+377
-225
lines changed
Lines changed: 157 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,157 @@
1+
#!/usr/bin/env python3
2+
# Copyright 2022 Canonical Ltd.
3+
# See LICENSE file for licensing details.
4+
import logging
5+
from time import sleep
6+
7+
import pytest
8+
from pytest_operator.plugin import OpsTest
9+
from tenacity import Retrying, stop_after_delay, wait_fixed
10+
11+
from ..helpers import (
12+
CHARM_BASE,
13+
)
14+
from .conftest import APPLICATION_NAME
15+
from .helpers import (
16+
METADATA,
17+
add_unit_with_storage,
18+
app_name,
19+
are_writes_increasing,
20+
check_writes,
21+
get_primary,
22+
get_storage_ids,
23+
is_cluster_updated,
24+
is_postgresql_ready,
25+
is_replica,
26+
is_secondary_up_to_date,
27+
reused_replica_storage,
28+
send_signal_to_process,
29+
start_continuous_writes,
30+
storage_type,
31+
)
32+
33+
logger = logging.getLogger(__name__)
34+
35+
APP_NAME = METADATA["name"]
36+
PATRONI_PROCESS = "/snap/charmed-postgresql/[0-9]*/usr/bin/patroni"
37+
POSTGRESQL_PROCESS = "postgres"
38+
DB_PROCESSES = [POSTGRESQL_PROCESS, PATRONI_PROCESS]
39+
MEDIAN_ELECTION_TIME = 10
40+
41+
42+
@pytest.mark.abort_on_fail
async def test_build_and_deploy(ops_test: OpsTest, charm) -> None:
    """Build and deploy three units of PostgreSQL plus the continuous-writes application.

    Users may provide their own pre-existing cluster for HA testing, so each
    deployment is skipped when the corresponding application is already present
    in the model.
    """
    wait_for_apps = False
    # It is possible for users to provide their own cluster for HA testing. Hence, check if there
    # is a pre-existing cluster.
    if not await app_name(ops_test):
        wait_for_apps = True
        async with ops_test.fast_forward():
            await ops_test.model.deploy(
                charm,
                num_units=3,
                base=CHARM_BASE,
                storage={
                    "archive": {"pool": "lxd-btrfs", "size": 2048},
                    "data": {"pool": "lxd-btrfs", "size": 2048},
                    "logs": {"pool": "lxd-btrfs", "size": 2048},
                    "temp": {"pool": "lxd-btrfs", "size": 2048},
                },
                config={"profile": "testing"},
            )
    # Deploy the continuous writes application charm if it wasn't already deployed.
    if not await app_name(ops_test, APPLICATION_NAME):
        wait_for_apps = True
        async with ops_test.fast_forward():
            await ops_test.model.deploy(
                APPLICATION_NAME,
                application_name=APPLICATION_NAME,
                base=CHARM_BASE,
                channel="edge",
            )

    # Only block on idle state when something was actually deployed above.
    if wait_for_apps:
        async with ops_test.fast_forward():
            await ops_test.model.wait_for_idle(status="active", timeout=1500)
77+
78+
79+
@pytest.mark.abort_on_fail
async def test_storage_re_use(ops_test: OpsTest, continuous_writes) -> None:
    """Verifies that database units with attached storage correctly repurpose storage.

    It is not enough to verify that Juju attaches the storage. Hence test checks that the
    postgresql properly uses the storage that was provided. (ie. doesn't just re-sync everything
    from primary, but instead computes a diff between current storage and primary storage.)
    """
    app = await app_name(ops_test)
    # Storage re-use is meaningless on rootfs — there is no detachable volume.
    if storage_type(ops_test, app) == "rootfs":
        pytest.skip(
            "reuse of storage can only be used on deployments with persistent storage not on rootfs deployments"
        )

    # Removing the only replica can be disastrous, so scale up first if needed.
    if len(ops_test.model.applications[app].units) < 2:
        await ops_test.model.applications[app].add_unit(count=1)
        await ops_test.model.wait_for_idle(apps=[app], status="active", timeout=1500)

    # Start an application that continuously writes data to the database.
    await start_continuous_writes(ops_test, app)

    # Remove a replica unit and attach its storage to a new unit.
    for unit in ops_test.model.applications[app].units:
        if await is_replica(ops_test, unit.name):
            break
    unit_storage_id = get_storage_ids(ops_test, unit.name)
    expected_units = len(ops_test.model.applications[app].units) - 1
    await ops_test.model.destroy_unit(unit.name)
    await ops_test.model.wait_for_idle(
        apps=[app], status="active", timeout=1000, wait_for_exact_units=expected_units
    )
    new_unit = await add_unit_with_storage(ops_test, app, unit_storage_id)

    assert await reused_replica_storage(ops_test, new_unit.name), (
        "attached storage not properly reused by Postgresql."
    )

    # Verify that no writes to the database were missed after stopping the writes.
    total_expected_writes = await check_writes(ops_test)

    # Verify that new instance is up-to-date.
    assert await is_secondary_up_to_date(ops_test, new_unit.name, total_expected_writes), (
        "new instance not up to date."
    )
124+
125+
126+
@pytest.mark.abort_on_fail
@pytest.mark.parametrize("process", DB_PROCESSES)
@pytest.mark.parametrize("signal", ["SIGTERM", "SIGKILL"])
async def test_interruption_db_process(
    ops_test: OpsTest, process: str, signal: str, continuous_writes, primary_start_timeout
) -> None:
    """Interrupt a database process on the primary and verify cluster fail-over.

    Sends SIGTERM/SIGKILL to either the postgres or Patroni process on the
    primary, then checks that writes keep flowing, a new primary is elected,
    and the old primary rejoins as a healthy secondary.
    """
    # Locate primary unit.
    app = await app_name(ops_test)
    primary_name = await get_primary(ops_test, app)

    # Start an application that continuously writes data to the database.
    await start_continuous_writes(ops_test, app)

    # Interrupt the database process.
    await send_signal_to_process(ops_test, primary_name, process, signal)

    # Wait some time to elect a new primary.
    sleep(MEDIAN_ELECTION_TIME * 6)

    async with ops_test.fast_forward():
        await are_writes_increasing(ops_test, primary_name)

        # Verify that a new primary gets elected (ie old primary is secondary).
        for attempt in Retrying(stop=stop_after_delay(60 * 3), wait=wait_fixed(3)):
            with attempt:
                new_primary_name = await get_primary(ops_test, app)
                assert new_primary_name != primary_name

        # Verify that the database service got restarted and is ready in the old primary.
        assert await is_postgresql_ready(ops_test, primary_name)

    await is_cluster_updated(ops_test, primary_name)
Lines changed: 204 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,204 @@
1+
#!/usr/bin/env python3
2+
# Copyright 2022 Canonical Ltd.
3+
# See LICENSE file for licensing details.
4+
import asyncio
5+
import logging
6+
from time import sleep
7+
8+
import pytest
9+
from pytest_operator.plugin import OpsTest
10+
from tenacity import Retrying, stop_after_delay, wait_fixed
11+
12+
from ..helpers import (
13+
CHARM_BASE,
14+
get_password,
15+
)
16+
from .conftest import APPLICATION_NAME
17+
from .helpers import (
18+
METADATA,
19+
ORIGINAL_RESTART_CONDITION,
20+
app_name,
21+
are_all_db_processes_down,
22+
are_writes_increasing,
23+
change_patroni_setting,
24+
check_writes,
25+
fetch_cluster_members,
26+
get_patroni_setting,
27+
get_primary,
28+
is_cluster_updated,
29+
is_postgresql_ready,
30+
send_signal_to_process,
31+
start_continuous_writes,
32+
update_restart_condition,
33+
)
34+
35+
logger = logging.getLogger(__name__)
36+
37+
APP_NAME = METADATA["name"]
38+
PATRONI_PROCESS = "/snap/charmed-postgresql/[0-9]*/usr/bin/patroni"
39+
POSTGRESQL_PROCESS = "postgres"
40+
DB_PROCESSES = [POSTGRESQL_PROCESS, PATRONI_PROCESS]
41+
MEDIAN_ELECTION_TIME = 10
42+
43+
44+
@pytest.mark.abort_on_fail
async def test_build_and_deploy(ops_test: OpsTest, charm) -> None:
    """Build and deploy three units of PostgreSQL plus the continuous-writes application.

    Users may provide their own pre-existing cluster for HA testing, so each
    deployment is skipped when the corresponding application is already present
    in the model.
    """
    wait_for_apps = False
    # It is possible for users to provide their own cluster for HA testing. Hence, check if there
    # is a pre-existing cluster.
    if not await app_name(ops_test):
        wait_for_apps = True
        async with ops_test.fast_forward():
            await ops_test.model.deploy(
                charm,
                num_units=3,
                base=CHARM_BASE,
                storage={
                    "archive": {"pool": "lxd-btrfs", "size": 2048},
                    "data": {"pool": "lxd-btrfs", "size": 2048},
                    "logs": {"pool": "lxd-btrfs", "size": 2048},
                    "temp": {"pool": "lxd-btrfs", "size": 2048},
                },
                config={"profile": "testing"},
            )
    # Deploy the continuous writes application charm if it wasn't already deployed.
    if not await app_name(ops_test, APPLICATION_NAME):
        wait_for_apps = True
        async with ops_test.fast_forward():
            await ops_test.model.deploy(
                APPLICATION_NAME,
                application_name=APPLICATION_NAME,
                base=CHARM_BASE,
                channel="edge",
            )

    # Only block on idle state when something was actually deployed above.
    if wait_for_apps:
        async with ops_test.fast_forward():
            await ops_test.model.wait_for_idle(status="active", timeout=1500)
79+
80+
81+
@pytest.mark.abort_on_fail
@pytest.mark.parametrize("process", DB_PROCESSES)
async def test_freeze_db_process(
    ops_test: OpsTest, process: str, continuous_writes, primary_start_timeout
) -> None:
    """Freeze a database process on the primary and verify cluster fail-over.

    Sends SIGSTOP to either the postgres or Patroni process on the primary,
    then checks that writes keep flowing and a new primary is elected. The
    frozen process is always resumed with SIGCONT (even on failure) so the
    cluster is left in a recoverable state.
    """
    # Locate primary unit.
    app = await app_name(ops_test)
    primary_name = await get_primary(ops_test, app)

    # Start an application that continuously writes data to the database.
    await start_continuous_writes(ops_test, app)

    # Freeze the database process.
    await send_signal_to_process(ops_test, primary_name, process, "SIGSTOP")

    # Wait some time to elect a new primary.
    sleep(MEDIAN_ELECTION_TIME * 6)

    async with ops_test.fast_forward():
        # Verify new writes are continuing by counting the number of writes before and after a
        # 3 minutes wait (this is a little more than the loop wait configuration, that is
        # considered to trigger a fail-over after primary_start_timeout is changed, and also
        # when freezing the DB process it take some more time to trigger the fail-over).
        try:
            await are_writes_increasing(ops_test, primary_name)

            # Verify that a new primary gets elected (ie old primary is secondary).
            for attempt in Retrying(stop=stop_after_delay(60 * 3), wait=wait_fixed(3)):
                with attempt:
                    new_primary_name = await get_primary(ops_test, app, down_unit=primary_name)
                    assert new_primary_name != primary_name
        finally:
            # Un-freeze the old primary.
            await send_signal_to_process(ops_test, primary_name, process, "SIGCONT")

        # Verify that the database service got restarted and is ready in the old primary.
        assert await is_postgresql_ready(ops_test, primary_name)

    await is_cluster_updated(ops_test, primary_name)
120+
121+
122+
@pytest.mark.abort_on_fail
@pytest.mark.parametrize("process", DB_PROCESSES)
@pytest.mark.parametrize("signal", ["SIGTERM", "SIGKILL"])
async def test_full_cluster_restart(
    ops_test: OpsTest,
    process: str,
    signal: str,
    continuous_writes,
    reset_restart_condition,
    loop_wait,
) -> None:
    """This test checks that a cluster recovers from a full cluster restart.

    The test can be called a full cluster crash when the signal sent to the OS process
    is SIGKILL.
    """
    # Locate primary unit.
    app = await app_name(ops_test)
    patroni_password = await get_password(ops_test, "patroni")

    # Change the loop wait setting to make Patroni wait more time before restarting PostgreSQL.
    initial_loop_wait = await get_patroni_setting(ops_test, "loop_wait")
    initial_ttl = await get_patroni_setting(ops_test, "ttl")
    # loop_wait parameter is limited by ttl value, thus we should increase it first
    await change_patroni_setting(ops_test, "ttl", 600, patroni_password, use_random_unit=True)
    await change_patroni_setting(
        ops_test, "loop_wait", 300, patroni_password, use_random_unit=True
    )

    # Start an application that continuously writes data to the database.
    await start_continuous_writes(ops_test, app)

    # Restart all units "simultaneously".
    await asyncio.gather(*[
        send_signal_to_process(ops_test, unit.name, process, signal)
        for unit in ops_test.model.applications[app].units
    ])

    # This test serves to verify behavior when all replicas are down at the same time that when
    # they come back online they operate as expected. This check verifies that we meet the criteria
    # of all replicas being down at the same time.
    try:
        assert await are_all_db_processes_down(ops_test, process, signal), (
            "Not all units down at the same time."
        )
    finally:
        # Always restore the original restart condition and Patroni settings,
        # even when the all-down assertion fails, so later tests see a sane cluster.
        if process == PATRONI_PROCESS:
            awaits = []
            for unit in ops_test.model.applications[app].units:
                awaits.append(update_restart_condition(ops_test, unit, ORIGINAL_RESTART_CONDITION))
            await asyncio.gather(*awaits)
        await change_patroni_setting(
            ops_test, "loop_wait", initial_loop_wait, patroni_password, use_random_unit=True
        )
        await change_patroni_setting(
            ops_test, "ttl", initial_ttl, patroni_password, use_random_unit=True
        )

    # Verify all units are up and running.
    sleep(30)
    for unit in ops_test.model.applications[app].units:
        assert await is_postgresql_ready(ops_test, unit.name), (
            f"unit {unit.name} not restarted after cluster restart."
        )

    # Check if a primary is elected
    for attempt in Retrying(stop=stop_after_delay(60 * 3), wait=wait_fixed(3)):
        with attempt:
            new_primary_name = await get_primary(ops_test, app)
            assert new_primary_name is not None, "Could not get primary from any unit"

    async with ops_test.fast_forward("60s"):
        await ops_test.model.wait_for_idle(status="active", timeout=1000)
        await are_writes_increasing(ops_test)

    # Verify that all units are part of the same cluster.
    member_ips = await fetch_cluster_members(ops_test)
    ip_addresses = [unit.public_address for unit in ops_test.model.applications[app].units]
    assert set(member_ips) == set(ip_addresses), "not all units are part of the same cluster."

    # Verify that no writes to the database were missed after stopping the writes.
    async with ops_test.fast_forward():
        await check_writes(ops_test)

0 commit comments

Comments
 (0)