Skip to content

Commit 9c88b59

Browse files
committed
use mongodb-k8s removal strategy + move to storage detached
1 parent f0a7578 commit 9c88b59

File tree

2 files changed

+83
-20
lines changed

2 files changed

+83
-20
lines changed

src/charm.py

Lines changed: 79 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55
import json
66
import logging
77
import re
8-
import time
98
from typing import Any, Dict, List, Optional, Set
109

1110
import jinja2
@@ -75,7 +74,11 @@
7574
)
7675

7776
from config import Config
78-
from exceptions import AdminUserCreationError, MissingSecretError
77+
from exceptions import (
78+
AdminUserCreationError,
79+
EarlyRemovalOfConfigServerError,
80+
MissingSecretError,
81+
)
7982

8083
logger = logging.getLogger(__name__)
8184

@@ -85,6 +88,9 @@
8588
UNIT_SCOPE = Config.Relations.UNIT_SCOPE
8689
Scopes = Config.Relations.Scopes
8790

91+
ONE_HOUR = 3600
92+
HALF_MINUTE = 30
93+
ONE_MINUTE = 60
8894
USER_CREATING_MAX_ATTEMPTS = 5
8995
USER_CREATION_COOLDOWN = 30
9096
REPLICA_SET_INIT_CHECK_TIMEOUT = 10
@@ -117,7 +123,7 @@ def __init__(self, *args):
117123

118124
self.framework.observe(self.on.get_password_action, self._on_get_password)
119125
self.framework.observe(self.on.set_password_action, self._on_set_password)
120-
self.framework.observe(self.on.stop, self._on_stop)
126+
self.framework.observe(self.on.mongodb_storage_detaching, self.mongodb_storage_detaching)
121127

122128
self.framework.observe(self.on.secret_remove, self._on_secret_remove)
123129
self.framework.observe(self.on.secret_changed, self._on_secret_changed)
@@ -153,6 +159,10 @@ def __init__(self, *args):
153159
)
154160

155161
# BEGIN: properties
162+
@property
163+
def _is_removing_last_replica(self) -> bool:
164+
"""Returns True if the last replica (juju unit) is getting removed."""
165+
return self.app.planned_units() == 0 and len(self.peers_units) == 0
156166

157167
@property
158168
def monitoring_jobs(self) -> list[dict[str, Any]]:
@@ -691,10 +701,6 @@ def _relation_changes_handler(self, event: RelationEvent) -> None:
691701
self._connect_mongodb_exporter()
692702
self._connect_pbm_agent()
693703

694-
if isinstance(event, RelationDepartedEvent):
695-
if event.departing_unit.name == self.unit.name:
696-
self.unit_peer_data.setdefault("unit_departed", "True")
697-
698704
if not self.unit.is_leader():
699705
return
700706

@@ -759,19 +765,72 @@ def _reconcile_mongo_hosts_and_users(self, event: RelationEvent) -> None:
759765
logger.info("Deferring reconfigure: error=%r", e)
760766
event.defer()
761767

762-
def _on_stop(self, event) -> None:
763-
if "True" == self.unit_peer_data.get("unit_departed", "False"):
764-
logger.debug(f"{self.unit.name} blocking on_stop")
765-
is_in_replica_set = True
766-
timeout = UNIT_REMOVAL_TIMEOUT
767-
while is_in_replica_set and timeout > 0:
768-
is_in_replica_set = self.is_unit_in_replica_set()
769-
time.sleep(1)
770-
timeout -= 1
771-
if timeout < 0:
772-
raise Exception(f"{self.unit.name}.on_stop timeout exceeded")
773-
logger.debug(f"{self.unit.name} releasing on_stop")
774-
self.unit_peer_data["unit_departed"] = ""
768+
def update_termination_grace_period(self, seconds: int) -> None:
769+
"""Patch the termination grace period for the stateful set."""
770+
pass
771+
772+
def mongodb_storage_detaching(self, event) -> None:
773+
"""Before storage detaches, allow removing unit to remove itself from the set.
774+
775+
If the removing unit is primary also allow it to step down and elect another unit as
776+
primary while it still has access to its storage.
777+
"""
778+
# self.update_termination_grace_period(ONE_HOUR)
779+
# if time_left < ONE_MINUTE:
780+
# time_left = (datetime.now() - start_time).seconds < 3600
781+
782+
if self.upgrade_in_progress:
783+
# We cannot defer and prevent a user from removing a unit, log a warning instead.
784+
logger.warning(
785+
"Removing replicas during an upgrade is not supported. The charm may be in a broken, unrecoverable state"
786+
)
787+
788+
# A single replica cannot step down as primary and we cannot reconfigure the replica set to
789+
# have 0 members.
790+
if self._is_removing_last_replica:
791+
# removing config-server from a sharded cluster can be disastrous.
792+
if self.is_role(Config.Role.CONFIG_SERVER) and self.config_server.has_shards():
793+
current_shards = self.config_server.get_related_shards()
794+
early_removal_message = f"Cannot remove config-server, still related to shards {', '.join(current_shards)}"
795+
logger.error(early_removal_message)
796+
# question: what happens in k8s if you raise in storage detached? I assume the pod
797+
# is still removed
798+
raise EarlyRemovalOfConfigServerError(early_removal_message)
799+
800+
# cannot drain shard after storage detached.
801+
if self.is_role(Config.Role.SHARD) and self.shard.has_config_server():
802+
logger.info("Wait for shard to drain before detaching storage.")
803+
self.status.set_and_share_status(MaintenanceStatus("Draining shard from cluster"))
804+
mongos_hosts = self.shard.get_mongos_hosts()
805+
# TODO need to update this function to attempt to patch the statefulset
806+
self.shard.wait_for_draining(mongos_hosts)
807+
logger.info("Shard successfully drained storage.")
808+
809+
self.update_termination_grace_period(HALF_MINUTE)
810+
return
811+
812+
try:
813+
# retries over a period of 10 minutes in an attempt to resolve race conditions; it is
814+
# not possible to defer in storage detached.
815+
logger.debug("Removing %s from replica set", self.unit_host(self.unit))
816+
for attempt in Retrying(
817+
stop=stop_after_attempt(10),
818+
wait=wait_fixed(1),
819+
reraise=True,
820+
):
821+
with attempt:
822+
# remove_replset_member retries for 60 seconds
823+
with MongoDBConnection(self.mongodb_config) as mongo:
824+
mongo.remove_replset_member(self.unit_host(self.unit))
825+
826+
except NotReadyError:
827+
logger.info(
828+
"Failed to remove %s from replica set, another member is syncing", self.unit.name
829+
)
830+
except PyMongoError as e:
831+
logger.error("Failed to remove %s from replica set, error=%r", self.unit.name, e)
832+
833+
self.update_termination_grace_period(HALF_MINUTE)
775834

776835
def _on_update_status(self, event: UpdateStatusEvent):
777836
# user-made mistakes might result in other incorrect statuses. Prioritise informing users of

src/exceptions.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,10 @@ class ApplicationHostNotFoundError(MongoError):
1616
"""Raised when a queried host is not in the application peers or the current host."""
1717

1818

19+
class EarlyRemovalOfConfigServerError(Exception):
20+
"""Raised when there is an attempt to remove a config-server, while related to a shard."""
21+
22+
1923
class MongoSecretError(MongoError):
2024
"""Common parent for all Mongo Secret Exceptions."""
2125

0 commit comments

Comments
 (0)