5 | 5 | import json
6 | 6 | import logging
7 | 7 | import re
8 |   | -import time
9 | 8 | from typing import Any, Dict, List, Optional, Set
10 | 9 |
11 | 10 | import jinja2

75 | 74 | )
76 | 75 |
77 | 76 | from config import Config
78 |    | -from exceptions import AdminUserCreationError, MissingSecretError
   | 77 | +from exceptions import (
   | 78 | +    AdminUserCreationError,
   | 79 | +    EarlyRemovalOfConfigServerError,
   | 80 | +    MissingSecretError,
   | 81 | +)
79 | 82 |
80 | 83 | logger = logging.getLogger(__name__)
81 | 84 |

85 | 88 | UNIT_SCOPE = Config.Relations.UNIT_SCOPE
86 | 89 | Scopes = Config.Relations.Scopes
87 | 90 |
   | 91 | +ONE_HOUR = 3600
   | 92 | +HALF_MINUTE = 30
   | 93 | +ONE_MINUTE = 60
88 | 94 | USER_CREATING_MAX_ATTEMPTS = 5
89 | 95 | USER_CREATION_COOLDOWN = 30
90 | 96 | REPLICA_SET_INIT_CHECK_TIMEOUT = 10

@@ -117,7 +123,7 @@ def __init__(self, *args):
117 | 123 |
118 | 124 |         self.framework.observe(self.on.get_password_action, self._on_get_password)
119 | 125 |         self.framework.observe(self.on.set_password_action, self._on_set_password)
120 |     | -        self.framework.observe(self.on.stop, self._on_stop)
    | 126 | +        self.framework.observe(self.on.mongodb_storage_detaching, self.mongodb_storage_detaching)
121 | 127 |
122 | 128 |         self.framework.observe(self.on.secret_remove, self._on_secret_remove)
123 | 129 |         self.framework.observe(self.on.secret_changed, self._on_secret_changed)

@@ -153,6 +159,10 @@ def __init__(self, *args):
153 | 159 |         )
154 | 160 |
155 | 161 |     # BEGIN: properties
    | 162 | +    @property
    | 163 | +    def _is_removing_last_replica(self) -> bool:
    | 164 | +        """Returns True if the last replica (juju unit) is getting removed."""
    | 165 | +        return self.app.planned_units() == 0 and len(self.peers_units) == 0
156 | 166 |
157 | 167 |     @property
158 | 168 |     def monitoring_jobs(self) -> list[dict[str, Any]]:

@@ -691,10 +701,6 @@ def _relation_changes_handler(self, event: RelationEvent) -> None:
691 | 701 |         self._connect_mongodb_exporter()
692 | 702 |         self._connect_pbm_agent()
693 | 703 |
694 |     | -        if isinstance(event, RelationDepartedEvent):
695 |     | -            if event.departing_unit.name == self.unit.name:
696 |     | -                self.unit_peer_data.setdefault("unit_departed", "True")
697 |     | -
698 | 704 |         if not self.unit.is_leader():
699 | 705 |             return
700 | 706 |

@@ -759,19 +765,72 @@ def _reconcile_mongo_hosts_and_users(self, event: RelationEvent) -> None:
759 | 765 |             logger.info("Deferring reconfigure: error=%r", e)
760 | 766 |             event.defer()
761 | 767 |
762 |     | -    def _on_stop(self, event) -> None:
763 |     | -        if "True" == self.unit_peer_data.get("unit_departed", "False"):
764 |     | -            logger.debug(f"{self.unit.name} blocking on_stop")
765 |     | -            is_in_replica_set = True
766 |     | -            timeout = UNIT_REMOVAL_TIMEOUT
767 |     | -            while is_in_replica_set and timeout > 0:
768 |     | -                is_in_replica_set = self.is_unit_in_replica_set()
769 |     | -                time.sleep(1)
770 |     | -                timeout -= 1
771 |     | -            if timeout < 0:
772 |     | -                raise Exception(f"{self.unit.name}.on_stop timeout exceeded")
773 |     | -            logger.debug(f"{self.unit.name} releasing on_stop")
774 |     | -            self.unit_peer_data["unit_departed"] = ""
    | 768 | +    def update_termination_grace_period(self, seconds: int) -> None:
    | 769 | +        """Patch the termination grace period for the stateful set."""
    | 770 | +        pass
    | 771 | +
    | 772 | +    def mongodb_storage_detaching(self, event) -> None:
    | 773 | +        """Before storage detaches, allow the removing unit to remove itself from the set.
    | 774 | +
    | 775 | +        If the removing unit is primary, also allow it to step down and elect another unit as
    | 776 | +        primary while it still has access to its storage.
    | 777 | +        """
    | 778 | +        # self.update_termination_grace_period(ONE_HOUR)
    | 779 | +        # if time_left < ONE_MINUTE:
    | 780 | +        #     time_left = (datetime.now() - start_time).seconds < 3600
    | 781 | +
    | 782 | +        if self.upgrade_in_progress:
    | 783 | +            # Deferring cannot prevent a user from removing a unit; log a warning instead.
    | 784 | +            logger.warning(
    | 785 | +                "Removing replicas during an upgrade is not supported. The charm may be in a broken, unrecoverable state"
    | 786 | +            )
    | 787 | +
    | 788 | +        # A single replica cannot step down as primary and we cannot reconfigure the replica set to
    | 789 | +        # have 0 members.
    | 790 | +        if self._is_removing_last_replica:
    | 791 | +            # removing the config-server from a sharded cluster can be disastrous.
    | 792 | +            if self.is_role(Config.Role.CONFIG_SERVER) and self.config_server.has_shards():
    | 793 | +                current_shards = self.config_server.get_related_shards()
    | 794 | +                early_removal_message = f"Cannot remove config-server, still related to shards {', '.join(current_shards)}"
    | 795 | +                logger.error(early_removal_message)
    | 796 | +                # question: what happens in k8s if you raise in storage-detaching? I assume the pod
    | 797 | +                # is still removed
    | 798 | +                raise EarlyRemovalOfConfigServerError(early_removal_message)
    | 799 | +
    | 800 | +            # cannot drain the shard after storage is detached.
    | 801 | +            if self.is_role(Config.Role.SHARD) and self.shard.has_config_server():
    | 802 | +                logger.info("Wait for shard to drain before detaching storage.")
    | 803 | +                self.status.set_and_share_status(MaintenanceStatus("Draining shard from cluster"))
    | 804 | +                mongos_hosts = self.shard.get_mongos_hosts()
    | 805 | +                # TODO need to update this function to attempt to patch the statefulset
    | 806 | +                self.shard.wait_for_draining(mongos_hosts)
    | 807 | +                logger.info("Shard successfully drained storage.")
    | 808 | +
    | 809 | +            self.update_termination_grace_period(HALF_MINUTE)
    | 810 | +            return
    | 811 | +
    | 812 | +        try:
    | 813 | +            # Retries over a period of 10 minutes in an attempt to resolve race conditions; it is
    | 814 | +            # not possible to defer in the storage-detaching hook.
    | 815 | +            logger.debug("Removing %s from replica set", self.unit_host(self.unit))
    | 816 | +            for attempt in Retrying(
    | 817 | +                stop=stop_after_attempt(10),
    | 818 | +                wait=wait_fixed(1),
    | 819 | +                reraise=True,
    | 820 | +            ):
    | 821 | +                with attempt:
    | 822 | +                    # remove_replset_member retries for 60 seconds
    | 823 | +                    with MongoDBConnection(self.mongodb_config) as mongo:
    | 824 | +                        mongo.remove_replset_member(self.unit_host(self.unit))
    | 825 | +
    | 826 | +        except NotReadyError:
    | 827 | +            logger.info(
    | 828 | +                "Failed to remove %s from replica set, another member is syncing", self.unit.name
    | 829 | +            )
    | 830 | +        except PyMongoError as e:
    | 831 | +            logger.error("Failed to remove %s from replica set, error=%r", self.unit.name, e)
    | 832 | +
    | 833 | +        self.update_termination_grace_period(HALF_MINUTE)
775 | 834 |
776 | 835 |     def _on_update_status(self, event: UpdateStatusEvent):
777 | 836 |         # user-made mistakes might result in other incorrect statuses. Prioritise informing users of
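
The `update_termination_grace_period` helper introduced above is still a stub (`pass`). As a rough illustration only, and not part of this change, a minimal sketch of patching the StatefulSet could look like the following, assuming the charm runs on Kubernetes, that `lightkube` is available, and that the StatefulSet is named after the Juju application:

```python
# Hypothetical sketch -- not part of this diff.
from lightkube import Client
from lightkube.resources.apps_v1 import StatefulSet
from lightkube.types import PatchType


def update_termination_grace_period(app_name: str, namespace: str, seconds: int) -> None:
    """Patch terminationGracePeriodSeconds on the application's StatefulSet."""
    patch = {"spec": {"template": {"spec": {"terminationGracePeriodSeconds": seconds}}}}
    # A merge patch touches only this field and leaves the rest of the pod template alone.
    Client().patch(
        StatefulSet,
        name=app_name,        # assumption: the StatefulSet shares the application's name
        namespace=namespace,  # assumption: the Juju model name is the Kubernetes namespace
        obj=patch,
        patch_type=PatchType.MERGE,
    )
```

In the charm the call would presumably be made with `self.app.name` and `self.model.name`. Note that changing the pod template of a StatefulSet normally triggers a rolling update of its pods, which is worth weighing before wiring this in.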