
Commit 4c3c9e7

[DPE-5665] move permanent operations to storage_detached (#351)

* move permanent operations to storage_detached
* fmt + lint
* apply original exit strategy to storage detached
1 parent f9224f3 commit 4c3c9e7
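
The core of this change is re-homing cleanup that must only run when a unit is going away for good: the stop hook also fires on restarts and upgrades, while a storage-detaching hook fires once, before Juju detaches the unit's storage and while the data is still readable. A hedged sketch of the wiring (the charm class and handler body are illustrative, not from this commit; it assumes the ops framework and a storage named "mongodb" in metadata.yaml, as this charm has):

import ops

class SketchCharm(ops.CharmBase):
    def __init__(self, *args):
        super().__init__(*args)
        # ops emits one <storage-name>_storage_detaching event per storage
        # declared in metadata.yaml; a "mongodb" storage yields this event.
        self.framework.observe(
            self.on.mongodb_storage_detaching, self._on_storage_detaching
        )

    def _on_storage_detaching(self, event: ops.StorageDetachingEvent) -> None:
        # Permanent teardown (e.g. leaving the replica set) belongs here,
        # not in stop, which also runs on restarts and upgrades.
        pass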

File tree

2 files changed: +80 −46 lines changed

src/charm.py

Lines changed: 72 additions & 46 deletions
@@ -5,7 +5,6 @@
 import json
 import logging
 import re
-import time
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Set

@@ -85,9 +84,11 @@
 from exceptions import (
     AdminUserCreationError,
     ContainerNotReadyError,
+    EarlyRemovalOfConfigServerError,
     FailedToUpdateFilesystem,
     MissingSecretError,
     NotConfigServerError,
+    UnitStillInReplicaSet,
 )
 from upgrades import kubernetes_upgrades
 from upgrades.mongodb_upgrades import MongoDBUpgrade
@@ -118,6 +119,8 @@ def __init__(self, *args):
         self.framework.observe(self.on.mongod_pebble_ready, self._on_mongod_pebble_ready)
         self.framework.observe(self.on.config_changed, self._on_config_changed)
         self.framework.observe(self.on.start, self._on_start)
+        self.framework.observe(self.on.stop, self._on_stop)
+        self.framework.observe(self.on.mongodb_storage_detaching, self.mongodb_storage_detaching)
         self.framework.observe(self.on.upgrade_charm, self._on_upgrade)
         self.framework.observe(self.on.update_status, self._on_update_status)
         self.framework.observe(
@@ -137,7 +140,6 @@ def __init__(self, *args):
 
         self.framework.observe(self.on.get_password_action, self._on_get_password)
         self.framework.observe(self.on.set_password_action, self._on_set_password)
-        self.framework.observe(self.on.stop, self._on_stop)
 
         self.framework.observe(self.on.secret_remove, self._on_secret_remove)
         self.framework.observe(self.on.secret_changed, self._on_secret_changed)
@@ -397,21 +399,6 @@ def db_initialised(self) -> bool:
         """Check if MongoDB is initialised."""
         return json.loads(self.app_peer_data.get("db_initialised", "false"))
 
-    @property
-    def unit_departed(self) -> bool:
-        """Whether the unit has departed or not."""
-        return json.loads(self.unit_peer_data.get("unit_departed", "false"))
-
-    @unit_departed.setter
-    def unit_departed(self, value: bool) -> None:
-        """Set the unit_departed flag."""
-        if isinstance(value, bool):
-            self.unit_peer_data["unit_departed"] = json.dumps(value)
-        else:
-            raise ValueError(
-                f"'unit_departed' must be a boolean value. Provided: {value} is of type {type(value)}"
-            )
-
     def is_role_changed(self) -> bool:
         """Checks if application is running in provided role."""
         return self.role != self.model.config["role"]
@@ -542,6 +529,11 @@ def get_charm_internal_revision(self) -> str:
         with open(Config.CHARM_INTERNAL_VERSION_FILE, "r") as f:
             return f.read().strip()
 
+    @property
+    def _is_removing_last_replica(self) -> bool:
+        """Returns True if the last replica (juju unit) is getting removed."""
+        return self.app.planned_units() == 0 and len(self.peers_units) == 0
+
     # END: properties
 
     # BEGIN: generic helper methods
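
A standalone restatement of that check, for illustration only (ops' planned_units() returns the number of units Juju intends the application to keep):

def is_removing_last_replica(planned_units: int, peer_count: int) -> bool:
    # True only on the final unit of a scale-to-zero: Juju plans to keep
    # no units, and every peer has already left the peer relation.
    return planned_units == 0 and peer_count == 0

assert is_removing_last_replica(0, 0)      # last unit of a scale-to-zero
assert not is_removing_last_replica(2, 1)  # ordinary scale-down, peers remain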
@@ -802,10 +794,6 @@ def _relation_changes_handler(self, event: RelationEvent) -> None:
         self._connect_mongodb_exporter()
         self._connect_pbm_agent()
 
-        if isinstance(event, RelationDepartedEvent):
-            if event.departing_unit.name == self.unit.name:
-                self.unit_departed = True
-
         if not self.unit.is_leader():
             return
 
@@ -881,28 +869,6 @@ def __handle_partition_on_stop(self) -> None:
         kubernetes_upgrades.partition.set(app_name=self.app.name, value=current_unit_number)
         logger.debug(f"Partition set to {current_unit_number} during stop event")
 
-    def __handle_relation_departed_on_stop(self) -> None:
-        """Leaves replicaset.
-
-        If the unit has not already left the replica set, this function
-        attempts to block operations until the unit is removed. Note that with
-        how Juju currently operates, we only have 30 seconds until SIGTERM
-        command, so we are by no means guaranteed to have removed the replica
-        before the pod is removed. However the leader will reconfigure the
-        replica set if this is the case on `update status`.
-        """
-        logger.debug(f"{self.unit.name} blocking on_stop")
-        is_in_replica_set = True
-        timeout = UNIT_REMOVAL_TIMEOUT
-        while is_in_replica_set and timeout > 0:
-            is_in_replica_set = self.is_unit_in_replica_set()
-            time.sleep(1)
-            timeout -= 1
-        if timeout < 0:
-            raise Exception(f"{self.unit.name}.on_stop timeout exceeded")
-        logger.debug("{self.unit.name} releasing on_stop")
-        self.unit_departed = False
-
     def __handle_upgrade_on_stop(self) -> None:
         """Sets the unit state to RESTARTING and step down from replicaset.
@@ -926,13 +892,62 @@ def __handle_upgrade_on_stop(self) -> None:
 
     def _on_stop(self, event) -> None:
         self.__handle_partition_on_stop()
-        if self.unit_departed:
-            self.__handle_relation_departed_on_stop()
         if not self.upgrade._upgrade:
             logger.debug("Peer relation missing during stop event")
             return
         self.__handle_upgrade_on_stop()
 
+    def mongodb_storage_detaching(self, event) -> None:
+        """Before storage detaches, allow removing unit to remove itself from the set.
+
+        If the removing unit is primary also allow it to step down and elect another unit as
+        primary while it still has access to its storage.
+        """
+        if self.upgrade_in_progress:
+            # We cannot defer and prevent a user from removing a unit, log a warning instead.
+            logger.warning(
+                "Removing replicas during an upgrade is not supported. The charm may be in a broken, unrecoverable state"
+            )
+
+        # A single replica cannot step down as primary and we cannot reconfigure the replica set to
+        # have 0 members.
+        if self._is_removing_last_replica:
+            # removing config-server from a sharded cluster can be disastrous.
+            if self.is_role(Config.Role.CONFIG_SERVER) and self.config_server.has_shards():
+                current_shards = self.config_server.get_related_shards()
+                early_removal_message = f"Cannot remove config-server, still related to shards {', '.join(current_shards)}"
+                logger.error(early_removal_message)
+                raise EarlyRemovalOfConfigServerError(early_removal_message)
+
+            # cannot drain shard after storage detached.
+            if self.is_role(Config.Role.SHARD) and self.shard.has_config_server():
+                logger.info("Wait for shard to drain before detaching storage.")
+                self.status.set_and_share_status(MaintenanceStatus("Draining shard from cluster"))
+                mongos_hosts = self.shard.get_mongos_hosts()
+                self.shard.wait_for_draining(mongos_hosts)
+                logger.info("Shard successfully drained storage.")
+
+        try:
+            # retries over a period of 10 minutes in an attempt to resolve race conditions
+            logger.debug("Removing %s from replica set", self.unit_host(self.unit))
+            for attempt in Retrying(
+                stop=stop_after_attempt(600),
+                wait=wait_fixed(1),
+                reraise=True,
+            ):
+                with attempt:
+                    # in K8s we have the leader remove the unit from the replica set to reduce race
+                    # conditions
+                    if self.is_unit_in_replica_set():
+                        raise UnitStillInReplicaSet()
+
+        except NotReadyError:
+            logger.info(
+                "Failed to remove %s from replica set, another member is syncing", self.unit.name
+            )
+        except PyMongoError as e:
+            logger.error("Failed to remove %s from replica set, error=%r", self.unit.name, e)
+
     def _on_update_status(self, event: UpdateStatusEvent):
         # user-made mistakes might result in other incorrect statuses. Prioritise informing users of
         # their mistake.
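
The new handler waits for the leader to drop this unit from the replica set by polling with tenacity's retrying-iterator pattern: raise a sentinel while the condition still holds, and let Retrying re-run the block. A self-contained sketch of that pattern with the same parameters (names here are illustrative, not the charm's):

from tenacity import Retrying, stop_after_attempt, wait_fixed

class StillInReplicaSetError(Exception):
    """Illustrative sentinel raised while the unit is still a member."""

def wait_until_removed(is_member) -> None:
    # One attempt per second, up to 600 attempts (~10 minutes), matching the
    # handler above. reraise=True surfaces the sentinel itself on timeout
    # instead of tenacity's RetryError wrapper.
    for attempt in Retrying(stop=stop_after_attempt(600), wait=wait_fixed(1), reraise=True):
        with attempt:
            if is_member():
                raise StillInReplicaSetError("unit still reported as a member")

wait_until_removed(lambda: False)  # returns immediately once membership is gone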
@@ -1206,6 +1221,17 @@ def _init_backup_user(self):
     # END: user management
 
     # BEGIN: helper functions
+    def _update_related_hosts(self, event) -> None:
+        # app relations should be made aware of the new set of hosts
+        try:
+            if not self.is_role(Config.Role.SHARD):
+                self.client_relations.update_app_relation_data()
+                self.config_server.update_mongos_hosts()
+                self.cluster.update_config_server_db(event)
+        except PyMongoError as e:
+            logger.error("Deferring on updating app relation data since: error: %r", e)
+            event.defer()
+            return
 
     def _is_user_created(self, user: MongoDBUser) -> bool:
         return f"{user.get_username()}-user-created" in self.app_peer_data
@@ -1377,7 +1403,7 @@ def _add_units_from_replica_set(
             mongo.add_replset_member(member)
 
     def _remove_units_from_replica_set(
-        self, evemt, mongo: MongoDBConnection, units_to_remove: Set[str]
+        self, event, mongo: MongoDBConnection, units_to_remove: Set[str]
     ) -> None:
         for member in units_to_remove:
             logger.debug("Removing %s from the replica set", member)

src/exceptions.py

Lines changed: 8 additions & 0 deletions
@@ -16,6 +16,10 @@ class ApplicationHostNotFoundError(MongoError):
     """Raised when a queried host is not in the application peers or the current host."""
 
 
+class EarlyRemovalOfConfigServerError(Exception):
+    """Raised when there is an attempt to remove a config-server, while related to a shard."""
+
+
 class MongoSecretError(MongoError):
     """Common parent for all Mongo Secret Exceptions."""
 
@@ -42,3 +46,7 @@ class ContainerNotReadyError(Exception):
 
 class FailedToUpdateFilesystem(Exception):
     """Raised when the container is not ready for a replan of services."""
+
+
+class UnitStillInReplicaSet(Exception):
+    """Raised when the unit is still a member of the replica set."""
