75 | 75 | )
76 | 76 |
77 | 77 | from config import Config
78 |    | -from exceptions import AdminUserCreationError, MissingSecretError
   | 78 | +from exceptions import AdminUserCreationError, MissingSecretError, EarlyRemovalOfConfigServerError
79 | 79 |
80 | 80 | logger = logging.getLogger(__name__)
81 | 81 |
@@ -85,6 +85,9 @@
85 | 85 | UNIT_SCOPE = Config.Relations.UNIT_SCOPE
86 | 86 | Scopes = Config.Relations.Scopes
87 | 87 |
   | 88 | +ONE_HOUR = 3600
   | 89 | +HALF_MINUTE = 30
   | 90 | +ONE_MINUTE = 60
88 | 91 | USER_CREATING_MAX_ATTEMPTS = 5
89 | 92 | USER_CREATION_COOLDOWN = 30
90 | 93 | REPLICA_SET_INIT_CHECK_TIMEOUT = 10
@@ -117,7 +120,7 @@ def __init__(self, *args):
117 | 120 |
118 | 121 |         self.framework.observe(self.on.get_password_action, self._on_get_password)
119 | 122 |         self.framework.observe(self.on.set_password_action, self._on_set_password)
120 |     | -        self.framework.observe(self.on.stop, self._on_stop)
    | 123 | +        self.framework.observe(self.on.mongodb_storage_detaching, self.mongodb_storage_detaching)
121 | 124 |
122 | 125 |         self.framework.observe(self.on.secret_remove, self._on_secret_remove)
123 | 126 |         self.framework.observe(self.on.secret_changed, self._on_secret_changed)
@@ -691,10 +694,6 @@ def _relation_changes_handler(self, event: RelationEvent) -> None:
691 | 694 |         self._connect_mongodb_exporter()
692 | 695 |         self._connect_pbm_agent()
693 | 696 |
694 |     | -        if isinstance(event, RelationDepartedEvent):
695 |     | -            if event.departing_unit.name == self.unit.name:
696 |     | -                self.unit_peer_data.setdefault("unit_departed", "True")
697 |     | -
698 | 697 |         if not self.unit.is_leader():
699 | 698 |             return
700 | 699 |
@@ -759,19 +758,71 @@ def _reconcile_mongo_hosts_and_users(self, event: RelationEvent) -> None:
759 | 758 |             logger.info("Deferring reconfigure: error=%r", e)
760 | 759 |             event.defer()
761 | 760 |
762 |     | -    def _on_stop(self, event) -> None:
763 |     | -        if "True" == self.unit_peer_data.get("unit_departed", "False"):
764 |     | -            logger.debug(f"{self.unit.name} blocking on_stop")
765 |     | -            is_in_replica_set = True
766 |     | -            timeout = UNIT_REMOVAL_TIMEOUT
767 |     | -            while is_in_replica_set and timeout > 0:
768 |     | -                is_in_replica_set = self.is_unit_in_replica_set()
769 |     | -                time.sleep(1)
770 |     | -                timeout -= 1
771 |     | -            if timeout < 0:
772 |     | -                raise Exception(f"{self.unit.name}.on_stop timeout exceeded")
773 |     | -            logger.debug(f"{self.unit.name} releasing on_stop")
774 |     | -            self.unit_peer_data["unit_departed"] = ""
    | 761 | +    def update_termination_grace_period(self, seconds: int) -> None:
    | 762 | +        # kubectl patch statefulset my-statefulset -p '{"spec": {"template": {"spec": {"terminationGracePeriodSeconds": 3600}, "metadata": {"annotations": {"force-update": "'$(date +%s)'"}}}}}'
    | 763 | +        pass
    | 764 | +
    | 765 | +    def mongodb_storage_detaching(self, event) -> None:
    | 766 | +        """Before storage detaches, allow the removing unit to remove itself from the set.
    | 767 | +        If the removing unit is primary, also allow it to step down and elect another unit as
    | 768 | +        primary while it still has access to its storage.
    | 769 | +        """
    | 770 | +
    | 771 | +        # self.update_termination_grace_period(ONE_HOUR)
    | 772 | +        # if time_left < ONE_MINUTE:
    | 773 | +        #     time_left = (datetime.now() - start_time).seconds < 3600
    | 774 | +
    | 775 | +        if self.upgrade_in_progress:
    | 776 | +            # We cannot defer and prevent a user from removing a unit; log a warning instead.
    | 777 | +            logger.warning(
    | 778 | +                "Removing replicas during an upgrade is not supported. The charm may be in a broken, unrecoverable state"
    | 779 | +            )
    | 780 | +
    | 781 | +        # A single replica cannot step down as primary and we cannot reconfigure the replica set to
    | 782 | +        # have 0 members.
    | 783 | +        if self._is_removing_last_replica:
    | 784 | +            # Removing the config-server from a sharded cluster can be disastrous.
    | 785 | +            if self.is_role(Config.Role.CONFIG_SERVER) and self.config_server.has_shards():
    | 786 | +                current_shards = self.config_server.get_related_shards()
    | 787 | +                early_removal_message = f"Cannot remove config-server, still related to shards {', '.join(current_shards)}"
    | 788 | +                logger.error(early_removal_message)
    | 789 | +                # question: what happens in k8s if you raise in storage-detaching? I assume the pod is still removed
    | 790 | +                raise EarlyRemovalOfConfigServerError(early_removal_message)
    | 791 | +
    | 792 | +            # Cannot drain the shard after storage is detached.
    | 793 | +            if self.is_role(Config.Role.SHARD) and self.shard.has_config_server():
    | 794 | +                logger.info("Wait for shard to drain before detaching storage.")
    | 795 | +                self.status.set_and_share_status(MaintenanceStatus("Draining shard from cluster"))
    | 796 | +                mongos_hosts = self.shard.get_mongos_hosts()
    | 797 | +                # TODO: update this function to attempt to patch the statefulset
    | 798 | +                self.shard.wait_for_draining(mongos_hosts)
    | 799 | +                logger.info("Shard successfully drained.")
    | 800 | +
    | 801 | +            self.update_termination_grace_period(HALF_MINUTE)
    | 802 | +            return
    | 803 | +
    | 804 | +        try:
    | 805 | +            # Retries over a period of 10 minutes in an attempt to resolve race conditions; it is
    | 806 | +            # not possible to defer in storage-detaching.
    | 807 | +            logger.debug("Removing %s from replica set", self.unit_host(self.unit))
    | 808 | +            for attempt in Retrying(
    | 809 | +                stop=stop_after_attempt(10),
    | 810 | +                wait=wait_fixed(1),
    | 811 | +                reraise=True,
    | 812 | +            ):
    | 813 | +                with attempt:
    | 814 | +                    # remove_replset_member retries for 60 seconds
    | 815 | +                    with MongoDBConnection(self.mongodb_config) as mongo:
    | 816 | +                        mongo.remove_replset_member(self.unit_host(self.unit))
    | 817 | +
    | 818 | +        except NotReadyError:
    | 819 | +            logger.info(
    | 820 | +                "Failed to remove %s from replica set, another member is syncing", self.unit.name
    | 821 | +            )
    | 822 | +        except PyMongoError as e:
    | 823 | +            logger.error("Failed to remove %s from replica set, error=%r", self.unit.name, e)
    | 824 | +
    | 825 | +        self.update_termination_grace_period(HALF_MINUTE)
775 | 826 |
776 | 827 |     def _on_update_status(self, event: UpdateStatusEvent):
777 | 828 |         # user-made mistakes might result in other incorrect statuses. Prioritise informing users of
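
Note on the `update_termination_grace_period` stub above: the commit records only the `kubectl patch` one-liner and a `pass` body. Below is a minimal sketch of what an in-charm equivalent could look like, assuming lightkube is (or becomes) a charm dependency and the application was deployed with `--trust` so it may patch its own StatefulSet; the standalone function and its `app_name`/`namespace` parameters are illustrative, not taken from the charm.

```python
# Illustrative sketch only -- not part of this commit.
# Assumes lightkube is available and the charm has RBAC rights to patch StatefulSets.
import time

from lightkube import Client
from lightkube.resources.apps_v1 import StatefulSet
from lightkube.types import PatchType


def update_termination_grace_period(app_name: str, namespace: str, seconds: int) -> None:
    """Patch terminationGracePeriodSeconds on the application's StatefulSet.

    Mirrors the kubectl one-liner in the stub; the force-update annotation bumps
    the pod template so the change is rolled out.
    """
    patch = {
        "spec": {
            "template": {
                "metadata": {"annotations": {"force-update": str(int(time.time()))}},
                "spec": {"terminationGracePeriodSeconds": seconds},
            }
        }
    }
    Client().patch(
        StatefulSet,
        name=app_name,        # on Kubernetes, Juju names the StatefulSet after the application
        namespace=namespace,  # the Juju model name is used as the namespace
        obj=patch,
        patch_type=PatchType.MERGE,
    )
```

If wired in as a method, the call sites in `mongodb_storage_detaching` suggest extending the period (e.g. to `ONE_HOUR`) before draining starts and dropping it back to `HALF_MINUTE` before the handler returns.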
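The commented-out `time_left` lines in `mongodb_storage_detaching` sketch a time-budget check, but as written they assign a boolean to `time_left`. The following is a hedged sketch of the likely intent, with illustrative names (`wait_with_budget`, `drain_step`) that do not exist in the charm: keep waiting for the shard to drain only while the extended grace period has more than a minute left.

```python
# Illustrative sketch only -- not part of this commit.
# Assumes the grace period was extended to ONE_HOUR before draining started.
import time
from datetime import datetime

ONE_HOUR = 3600
ONE_MINUTE = 60


def wait_with_budget(drain_step, grace_period: int = ONE_HOUR) -> bool:
    """Run drain_step() until it reports completion or the grace period is nearly spent.

    Returns True if draining finished, False if the time budget ran out.
    """
    start_time = datetime.now()
    while True:
        if drain_step():  # hypothetical callable: returns True once the shard is drained
            return True
        time_left = grace_period - (datetime.now() - start_time).total_seconds()
        if time_left < ONE_MINUTE:  # keep a margin before the kubelet sends SIGKILL
            return False
        time.sleep(5)  # poll at a modest interval
```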