Skip to content

Commit 7299748

Browse files
authored
[DPE-4532] Increase timeout and terminate processes that are still up (#514)
* Increase timeout and log unit that is still up * Early fail * Bump coverage * Restore pyproj * Bump coverage * Bump libs * Bump coverage * Revert cluster test * Try to rekill process * Revert removed assert
1 parent 90d6bd2 commit 7299748

File tree

4 files changed

+40
-6
lines changed

4 files changed

+40
-6
lines changed

lib/charms/data_platform_libs/v0/data_interfaces.py

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -331,7 +331,7 @@ def _on_topic_requested(self, event: TopicRequestedEvent):
331331

332332
# Increment this PATCH version before using `charmcraft publish-lib` or reset
333333
# to 0 if you are raising the major API version
334-
LIBPATCH = 37
334+
LIBPATCH = 38
335335

336336
PYDEPS = ["ops>=2.0.0"]
337337

@@ -2606,6 +2606,14 @@ def set_version(self, relation_id: int, version: str) -> None:
26062606
"""
26072607
self.update_relation_data(relation_id, {"version": version})
26082608

2609+
def set_subordinated(self, relation_id: int) -> None:
2610+
"""Raises the subordinated flag in the application relation databag.
2611+
2612+
Args:
2613+
relation_id: the identifier for a particular relation.
2614+
"""
2615+
self.update_relation_data(relation_id, {"subordinated": "true"})
2616+
26092617

26102618
class DatabaseProviderEventHandlers(EventHandlers):
26112619
"""Provider-side of the database relation handlers."""
@@ -2842,6 +2850,21 @@ def _on_relation_created_event(self, event: RelationCreatedEvent) -> None:
28422850

28432851
def _on_relation_changed_event(self, event: RelationChangedEvent) -> None:
28442852
"""Event emitted when the database relation has changed."""
2853+
is_subordinate = False
2854+
remote_unit_data = None
2855+
for key in event.relation.data.keys():
2856+
if isinstance(key, Unit) and not key.name.startswith(self.charm.app.name):
2857+
remote_unit_data = event.relation.data[key]
2858+
elif isinstance(key, Application) and key.name != self.charm.app.name:
2859+
is_subordinate = event.relation.data[key].get("subordinated") == "true"
2860+
2861+
if is_subordinate:
2862+
if not remote_unit_data:
2863+
return
2864+
2865+
if remote_unit_data.get("state") != "ready":
2866+
return
2867+
28452868
# Check which data has changed to emit customs events.
28462869
diff = self._diff(event)
28472870

lib/charms/data_platform_libs/v0/upgrade.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -285,7 +285,7 @@ def restart(self, event) -> None:
285285

286286
# Increment this PATCH version before using `charmcraft publish-lib` or reset
287287
# to 0 if you are raising the major API version
288-
LIBPATCH = 17
288+
LIBPATCH = 18
289289

290290
PYDEPS = ["pydantic>=1.10,<2", "poetry-core"]
291291

@@ -921,7 +921,7 @@ def _on_upgrade_charm(self, event: UpgradeCharmEvent) -> None:
921921
self.charm.unit.status = WaitingStatus("other units upgrading first...")
922922
self.peer_relation.data[self.charm.unit].update({"state": "ready"})
923923

924-
if self.charm.app.planned_units() == 1:
924+
if len(self.app_units) == 1:
925925
# single unit upgrade, emit upgrade_granted event right away
926926
getattr(self.on, "upgrade_granted").emit()
927927

tests/integration/ha_tests/helpers.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ class ProcessRunningError(Exception):
5959
"""Raised when a process is running when it is not expected to be."""
6060

6161

62-
async def are_all_db_processes_down(ops_test: OpsTest, process: str) -> bool:
62+
async def are_all_db_processes_down(ops_test: OpsTest, process: str, signal: str) -> bool:
6363
"""Verifies that all units of the charm do not have the DB process running."""
6464
app = await app_name(ops_test)
6565
if "/" in process:
@@ -68,7 +68,7 @@ async def are_all_db_processes_down(ops_test: OpsTest, process: str) -> bool:
6868
pgrep_cmd = ("pgrep", "-x", process)
6969

7070
try:
71-
for attempt in Retrying(stop=stop_after_delay(60), wait=wait_fixed(3)):
71+
for attempt in Retrying(stop=stop_after_delay(400), wait=wait_fixed(3)):
7272
with attempt:
7373
for unit in ops_test.model.applications[app].units:
7474
_, processes, _ = await ops_test.juju("ssh", unit.name, *pgrep_cmd)
@@ -79,6 +79,9 @@ async def are_all_db_processes_down(ops_test: OpsTest, process: str) -> bool:
7979

8080
# If something was returned, there is a running process.
8181
if len(processes) > 0:
82+
logger.info("Unit %s not yet down" % unit.name)
83+
# Try to rekill the unit
84+
await send_signal_to_process(ops_test, unit.name, process, signal)
8285
raise ProcessRunningError
8386
except RetryError:
8487
return False

tests/integration/ha_tests/test_self_healing.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,7 @@ async def test_build_and_deploy(ops_test: OpsTest) -> None:
9595

9696

9797
@pytest.mark.group(1)
98+
@pytest.mark.abort_on_fail
9899
async def test_storage_re_use(ops_test, continuous_writes):
99100
"""Verifies that database units with attached storage correctly repurpose storage.
100101
@@ -142,6 +143,7 @@ async def test_storage_re_use(ops_test, continuous_writes):
142143

143144

144145
@pytest.mark.group(1)
146+
@pytest.mark.abort_on_fail
145147
@pytest.mark.parametrize("process", DB_PROCESSES)
146148
async def test_kill_db_process(
147149
ops_test: OpsTest, process: str, continuous_writes, primary_start_timeout
@@ -170,6 +172,7 @@ async def test_kill_db_process(
170172

171173

172174
@pytest.mark.group(1)
175+
@pytest.mark.abort_on_fail
173176
@pytest.mark.parametrize("process", DB_PROCESSES)
174177
async def test_freeze_db_process(
175178
ops_test: OpsTest, process: str, continuous_writes, primary_start_timeout
@@ -208,6 +211,7 @@ async def test_freeze_db_process(
208211

209212

210213
@pytest.mark.group(1)
214+
@pytest.mark.abort_on_fail
211215
@pytest.mark.parametrize("process", DB_PROCESSES)
212216
async def test_restart_db_process(
213217
ops_test: OpsTest, process: str, continuous_writes, primary_start_timeout
@@ -236,6 +240,7 @@ async def test_restart_db_process(
236240

237241

238242
@pytest.mark.group(1)
243+
@pytest.mark.abort_on_fail
239244
@pytest.mark.parametrize("process", DB_PROCESSES)
240245
@pytest.mark.parametrize("signal", ["SIGTERM", "SIGKILL"])
241246
async def test_full_cluster_restart(
@@ -272,7 +277,7 @@ async def test_full_cluster_restart(
272277
# of all replicas being down at the same time.
273278
try:
274279
assert await are_all_db_processes_down(
275-
ops_test, process
280+
ops_test, process, signal
276281
), "Not all units down at the same time."
277282
finally:
278283
if process == PATRONI_PROCESS:
@@ -304,6 +309,7 @@ async def test_full_cluster_restart(
304309

305310

306311
@pytest.mark.group(1)
312+
@pytest.mark.abort_on_fail
307313
@pytest.mark.unstable
308314
async def test_forceful_restart_without_data_and_transaction_logs(
309315
ops_test: OpsTest,
@@ -380,6 +386,7 @@ async def test_forceful_restart_without_data_and_transaction_logs(
380386

381387

382388
@pytest.mark.group(1)
389+
@pytest.mark.abort_on_fail
383390
async def test_network_cut(ops_test: OpsTest, continuous_writes, primary_start_timeout):
384391
"""Completely cut and restore network."""
385392
# Locate primary unit.
@@ -468,6 +475,7 @@ async def test_network_cut(ops_test: OpsTest, continuous_writes, primary_start_t
468475

469476

470477
@pytest.mark.group(1)
478+
@pytest.mark.abort_on_fail
471479
async def test_network_cut_without_ip_change(
472480
ops_test: OpsTest, continuous_writes, primary_start_timeout
473481
):

0 commit comments

Comments
 (0)