Skip to content

Commit fc992f0

Browse files
committed
patch stateful set
1 parent 9c88b59 commit fc992f0

File tree

1 file changed

+76
-13
lines changed

1 file changed

+76
-13
lines changed

src/charm.py

Lines changed: 76 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import json
66
import logging
77
import re
8+
import time
89
from typing import Any, Dict, List, Optional, Set
910

1011
import jinja2
@@ -41,6 +42,9 @@
4142
CrossAppVersionChecker,
4243
get_charm_revision,
4344
)
45+
from lightkube import Client
46+
from lightkube.resources.apps_v1 import StatefulSet
47+
from lightkube.types import PatchType
4448
from ops.charm import (
4549
ActionEvent,
4650
CharmBase,
@@ -88,9 +92,8 @@
8892
UNIT_SCOPE = Config.Relations.UNIT_SCOPE
8993
Scopes = Config.Relations.Scopes
9094

91-
ONE_HOUR = 3600
92-
HALF_MINUTE = 30
9395
ONE_MINUTE = 60
96+
ONE_YEAR = 31540000
9497
USER_CREATING_MAX_ATTEMPTS = 5
9598
USER_CREATION_COOLDOWN = 30
9699
REPLICA_SET_INIT_CHECK_TIMEOUT = 10
@@ -101,10 +104,10 @@ class MongoDBCharm(CharmBase):
101104

102105
def __init__(self, *args):
103106
super().__init__(*args)
104-
105107
self.framework.observe(self.on.mongod_pebble_ready, self._on_mongod_pebble_ready)
106108
self.framework.observe(self.on.config_changed, self._on_config_changed)
107109
self.framework.observe(self.on.start, self._on_start)
110+
self.framework.observe(self.on.stop, self._on_stop)
108111
self.framework.observe(self.on.update_status, self._on_update_status)
109112
self.framework.observe(
110113
self.on[Config.Relations.PEERS].relation_joined, self._relation_changes_handler
@@ -559,6 +562,7 @@ def _compare_secret_ids(secret_id1: str, secret_id2: str) -> bool:
559562
return False
560563

561564
# BEGIN: charm events
565+
562566
def _on_mongod_pebble_ready(self, event) -> None:
563567
"""Configure MongoDB pebble layer specification."""
564568
# Get a reference to the container attribute
@@ -656,6 +660,19 @@ def _on_start(self, event) -> None:
656660
It is needed to install mongodb-clients inside the charm container
657661
to make this function work correctly.
658662
"""
663+
# Patch the stateful set to have an increased termination period to prevent data loss on
664+
# removed shards. As Juju gives us a termination period of 30 seconds:
665+
# https://bugs.launchpad.net/juju/+bug/2035102
666+
667+
# It doesn't matter if we patch the stateful set before or after the charm has started.
668+
# The usual start hooks emitted by juju will have already been emitted, so we can expect
669+
# two rounds of restarts on one or more units (some units that get initialised late will
670+
# only have one round of restarts). The second round of start hooks will be emitted
671+
# **only after the replica set has been initialized**, we have 0 control over that.
672+
673+
if self.unit.is_leader() and self.get_current_termination_period() != ONE_YEAR:
674+
self.update_termination_grace_period(ONE_YEAR)
675+
659676
container = self.unit.get_container(Config.CONTAINER_NAME)
660677
if not container.can_connect():
661678
logger.debug("mongod container is not ready yet.")
@@ -765,20 +782,42 @@ def _reconcile_mongo_hosts_and_users(self, event: RelationEvent) -> None:
765782
logger.info("Deferring reconfigure: error=%r", e)
766783
event.defer()
767784

785+
def get_current_termination_period(self) -> int:
    """Return the termination grace period currently set on this application's stateful set.

    The stateful set is looked up by the juju application name, in the namespace named
    after the juju model.
    """
    pod_spec = (
        Client()
        .get(StatefulSet, name=self.app.name, namespace=self.model.name)
        .spec.template.spec
    )
    return pod_spec.terminationGracePeriodSeconds
790+
768791
def update_termination_grace_period(self, seconds: int) -> None:
    """Patch the termination grace period for the stateful set of this juju application.

    This is a no-op unless the charm is running with the shard role.

    Args:
        seconds: value written to `terminationGracePeriodSeconds` in the pod template of
            the stateful set.
    """
    # updating the termination grace period is only useful for shards, whose sudden removal
    # can result in data-loss
    if not self.is_role(Config.Role.SHARD):
        return

    patch_data = {
        "spec": {
            "template": {
                # honour the value requested by the caller rather than a fixed constant
                "spec": {"terminationGracePeriodSeconds": seconds},
                # the timestamp makes the pod template differ on every call
                # NOTE(review): presumably this is what forces the pods to be re-rolled
                # with the new grace period - confirm
                "metadata": {"annotations": {"force-update": str(int(time.time()))}},
            }
        }
    }

    client = Client()
    client.patch(
        StatefulSet,
        name=self.app.name,
        namespace=self.model.name,
        obj=patch_data,
        patch_type=PatchType.MERGE,
    )
771814

772815
def mongodb_storage_detaching(self, event) -> None:
773816
"""Before storage detaches, allow removing unit to remove itself from the set.
774817
775818
If the removing unit is primary also allow it to step down and elect another unit as
776819
primary while it still has access to its storage.
777820
"""
778-
# self.update_termination_grace_period(ONE_HOUR)
779-
# if time_left < ONE_MINUTE:
780-
# time_left = (datetime.now() - start_time).seconds < 3600
781-
782821
if self.upgrade_in_progress:
783822
# We cannot defer and prevent a user from removing a unit, log a warning instead.
784823
logger.warning(
@@ -806,9 +845,6 @@ def mongodb_storage_detaching(self, event) -> None:
806845
self.shard.wait_for_draining(mongos_hosts)
807846
logger.info("Shard successfully drained storage.")
808847

809-
self.update_termination_grace_period(HALF_MINUTE)
810-
return
811-
812848
try:
813849
# retries over a period of 10 minutes in an attempt to resolve race conditions it is
814850
# not possible to defer in storage detached.
@@ -830,7 +866,23 @@ def mongodb_storage_detaching(self, event) -> None:
830866
except PyMongoError as e:
831867
logger.error("Failed to remove %s from replica set, error=%r", self.unit.name, e)
832868

833-
self.update_termination_grace_period(HALF_MINUTE)
869+
def _on_stop(self, _) -> None:
    """Ask this unit to step down as primary before it stops.

    A stop event can follow a refresh, the removal of a unit, or a pod restart.

    While this unit is still the primary and there is more than one peer unit, a step-down
    is requested once per second, for at most ONE_MINUTE attempts.
    """
    # POC only: this wait could live in the mongodb lib instead, e.g. as a
    # wait_for_new_primary function.
    attempts = 0
    while True:
        # checks are kept in this order: primary first, then peer count, then the timeout
        if self.unit.name != self.primary:
            break
        if len(self.peers_units) <= 1:
            break
        if attempts >= ONE_MINUTE:
            break

        logger.debug("Stepping down current primary, before stopping.")
        with MongoDBConnection(self.mongodb_config) as mongo:
            mongo.step_down_primary()

        time.sleep(1)
        attempts += 1
834886

835887
def _on_update_status(self, event: UpdateStatusEvent):
836888
# user-made mistakes might result in other incorrect statuses. Prioritise informing users of
@@ -866,6 +918,17 @@ def _on_update_status(self, event: UpdateStatusEvent):
866918

867919
self.status.set_and_share_status(self.status.process_statuses())
868920

921+
# We must ensure that juju does not overwrite our termination period, so we should update
922+
# it as needed. However, updating the termination period can result in an onslaught of
923+
# events, including the upgrade event. To prevent this from messing with upgrades do not
924+
# update the termination period when an upgrade is occurring.
925+
if (
926+
self.unit.is_leader()
927+
and self.get_current_termination_period() != ONE_YEAR
928+
and not self.upgrade_in_progress
929+
):
930+
self.update_termination_grace_period(ONE_YEAR)
931+
869932
# END: charm events
870933

871934
# BEGIN: actions

0 commit comments

Comments
 (0)