5
5
import json
6
6
import logging
7
7
import re
8
+ import time
8
9
from typing import Any , Dict , List , Optional , Set
9
10
10
11
import jinja2
41
42
CrossAppVersionChecker ,
42
43
get_charm_revision ,
43
44
)
45
+ from lightkube import Client
46
+ from lightkube .resources .apps_v1 import StatefulSet
47
+ from lightkube .types import PatchType
44
48
from ops .charm import (
45
49
ActionEvent ,
46
50
CharmBase ,
88
92
UNIT_SCOPE = Config .Relations .UNIT_SCOPE
89
93
Scopes = Config .Relations .Scopes
90
94
91
- ONE_HOUR = 3600
92
- HALF_MINUTE = 30
93
95
ONE_MINUTE = 60
96
+ ONE_YEAR = 31540000
94
97
USER_CREATING_MAX_ATTEMPTS = 5
95
98
USER_CREATION_COOLDOWN = 30
96
99
REPLICA_SET_INIT_CHECK_TIMEOUT = 10
@@ -101,10 +104,10 @@ class MongoDBCharm(CharmBase):
101
104
102
105
def __init__ (self , * args ):
103
106
super ().__init__ (* args )
104
-
105
107
self .framework .observe (self .on .mongod_pebble_ready , self ._on_mongod_pebble_ready )
106
108
self .framework .observe (self .on .config_changed , self ._on_config_changed )
107
109
self .framework .observe (self .on .start , self ._on_start )
110
+ self .framework .observe (self .on .stop , self ._on_stop )
108
111
self .framework .observe (self .on .update_status , self ._on_update_status )
109
112
self .framework .observe (
110
113
self .on [Config .Relations .PEERS ].relation_joined , self ._relation_changes_handler
@@ -559,6 +562,7 @@ def _compare_secret_ids(secret_id1: str, secret_id2: str) -> bool:
559
562
return False
560
563
561
564
# BEGIN: charm events
565
+
562
566
def _on_mongod_pebble_ready (self , event ) -> None :
563
567
"""Configure MongoDB pebble layer specification."""
564
568
# Get a reference to the container attribute
@@ -656,6 +660,19 @@ def _on_start(self, event) -> None:
656
660
It is needed to install mongodb-clients inside the charm container
657
661
to make this function work correctly.
658
662
"""
663
+ # Patch the stateful set to have an increased termination period to prevent data loss on
664
+ # removed shards, since Juju only gives us a termination period of 30 seconds:
665
+ # https://bugs.launchpad.net/juju/+bug/2035102
666
+
667
+ # It doesn't matter if we patch the stateful set before or after the charm has started.
668
+ # The usual start hooks emitted by juju will have already been emitted, so we can expect
669
+ # two rounds of restarts on one or more units (some units that get initialised late will
670
+ # only have one round of restarts). The second round of start hooks will be emitted
671
+ # **only after the replica set has been initialized**, we have 0 control over that.
672
+
673
+ if self .unit .is_leader () and self .get_current_termination_period () != ONE_YEAR :
674
+ self .update_termination_grace_period (ONE_YEAR )
675
+
659
676
container = self .unit .get_container (Config .CONTAINER_NAME )
660
677
if not container .can_connect ():
661
678
logger .debug ("mongod container is not ready yet." )
@@ -765,20 +782,42 @@ def _reconcile_mongo_hosts_and_users(self, event: RelationEvent) -> None:
765
782
logger .info ("Deferring reconfigure: error=%r" , e )
766
783
event .defer ()
767
784
785
def get_current_termination_period(self) -> int:
    """Return the termination grace period set on this application's stateful set.

    The stateful set is looked up by the juju application name, inside the namespace that
    carries the juju model name.
    """
    stateful_set = Client().get(
        StatefulSet,
        name=self.app.name,
        namespace=self.model.name,
    )
    pod_spec = stateful_set.spec.template.spec
    return pod_spec.terminationGracePeriodSeconds
790
+
768
791
def update_termination_grace_period(self, seconds: int) -> None:
    """Patch the termination grace period for the stateful set of this juju application.

    Nothing is patched unless this application has the shard role.

    Args:
        seconds: termination grace period, in seconds, to write to the pod template.
    """
    # updating the termination grace period is only useful for shards, whose sudden removal
    # can result in data-loss
    if not self.is_role(Config.Role.SHARD):
        return

    client = Client()
    patch_data = {
        "spec": {
            "template": {
                # honour the requested value instead of always writing ONE_YEAR
                "spec": {"terminationGracePeriodSeconds": seconds},
                # a fresh timestamp makes the pod template differ on every patch
                # NOTE(review): presumably this is what forces the pods to roll - confirm
                "metadata": {"annotations": {"force-update": str(int(time.time()))}},
            }
        }
    }
    client.patch(
        StatefulSet,
        name=self.app.name,
        namespace=self.model.name,
        obj=patch_data,
        patch_type=PatchType.MERGE,
    )
771
814
772
815
def mongodb_storage_detaching (self , event ) -> None :
773
816
"""Before storage detaches, allow removing unit to remove itself from the set.
774
817
775
818
If the removing unit is primary also allow it to step down and elect another unit as
776
819
primary while it still has access to its storage.
777
820
"""
778
- # self.update_termination_grace_period(ONE_HOUR)
779
- # if time_left < ONE_MINUTE:
780
- # time_left = (datetime.now() - start_time).seconds < 3600
781
-
782
821
if self .upgrade_in_progress :
783
822
# We cannot defer and prevent a user from removing a unit, log a warning instead.
784
823
logger .warning (
@@ -806,9 +845,6 @@ def mongodb_storage_detaching(self, event) -> None:
806
845
self .shard .wait_for_draining (mongos_hosts )
807
846
logger .info ("Shard successfully drained storage." )
808
847
809
- self .update_termination_grace_period (HALF_MINUTE )
810
- return
811
-
812
848
try :
813
849
# retries over a period of 10 minutes in an attempt to resolve race conditions; it is
814
850
# not possible to defer in storage detached.
@@ -830,7 +866,23 @@ def mongodb_storage_detaching(self, event) -> None:
830
866
except PyMongoError as e :
831
867
logger .error ("Failed to remove %s from replica set, error=%r" , self .unit .name , e )
832
868
833
- self .update_termination_grace_period (HALF_MINUTE )
869
def _on_stop(self, _) -> None:
    """Handle on_stop event.

    On stop can occur after a user has refreshed, after a unit has been removed, or when a pod
    is getting restarted.

    While this unit is the primary and has more than one peer unit, it is asked to step down.
    The check is repeated once per second for at most ONE_MINUTE attempts.
    """
    # I can add this functionality to mongodb lib - i.e. a function wait_for_new_primary, but
    # this is just a POC
    waiting = 0
    while (
        self.unit.name == self.primary and len(self.peers_units) > 1 and waiting < ONE_MINUTE
    ):
        logger.debug("Stepping down current primary, before stopping.")
        try:
            with MongoDBConnection(self.mongodb_config) as mongo:
                mongo.step_down_primary()
        except PyMongoError as e:
            # a failed step down must not abort the stop hook; log it and retry on the next
            # pass, which stays bounded by the attempt counter
            logger.error("Failed to step down %s before stopping, error=%r", self.unit.name, e)
        time.sleep(1)
        waiting += 1
834
886
835
887
def _on_update_status (self , event : UpdateStatusEvent ):
836
888
# user-made mistakes might result in other incorrect statuses. Prioritise informing users of
@@ -866,6 +918,17 @@ def _on_update_status(self, event: UpdateStatusEvent):
866
918
867
919
self .status .set_and_share_status (self .status .process_statuses ())
868
920
921
+ # We must ensure that juju does not overwrite our termination period, so we should update
922
+ # it as needed. However, updating the termination period can result in an onslaught of
923
+ # events, including the upgrade event. To prevent this from messing with upgrades do not
924
+ # update the termination period when an upgrade is occurring.
925
+ if (
926
+ self .unit .is_leader ()
927
+ and self .get_current_termination_period () != ONE_YEAR
928
+ and not self .upgrade_in_progress
929
+ ):
930
+ self .update_termination_grace_period (ONE_YEAR )
931
+
869
932
# END: charm events
870
933
871
934
# BEGIN: actions
0 commit comments