 import json
 import logging
 import re
-import time
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Set
 
 from exceptions import (
     AdminUserCreationError,
     ContainerNotReadyError,
+    EarlyRemovalOfConfigServerError,
     FailedToUpdateFilesystem,
     MissingSecretError,
     NotConfigServerError,
+    UnitStillInReplicaSet,
 )
 from upgrades import kubernetes_upgrades
 from upgrades.mongodb_upgrades import MongoDBUpgrade
@@ -118,6 +119,8 @@ def __init__(self, *args):
         self.framework.observe(self.on.mongod_pebble_ready, self._on_mongod_pebble_ready)
         self.framework.observe(self.on.config_changed, self._on_config_changed)
         self.framework.observe(self.on.start, self._on_start)
+        self.framework.observe(self.on.stop, self._on_stop)
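+        # storage-detaching lets the unit leave the replica set while it still has its data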
+        self.framework.observe(self.on.mongodb_storage_detaching, self.mongodb_storage_detaching)
         self.framework.observe(self.on.upgrade_charm, self._on_upgrade)
         self.framework.observe(self.on.update_status, self._on_update_status)
         self.framework.observe(
@@ -137,7 +140,6 @@ def __init__(self, *args):
 
         self.framework.observe(self.on.get_password_action, self._on_get_password)
         self.framework.observe(self.on.set_password_action, self._on_set_password)
-        self.framework.observe(self.on.stop, self._on_stop)
 
         self.framework.observe(self.on.secret_remove, self._on_secret_remove)
         self.framework.observe(self.on.secret_changed, self._on_secret_changed)
@@ -397,21 +399,6 @@ def db_initialised(self) -> bool:
         """Check if MongoDB is initialised."""
         return json.loads(self.app_peer_data.get("db_initialised", "false"))
 
-    @property
-    def unit_departed(self) -> bool:
-        """Whether the unit has departed or not."""
-        return json.loads(self.unit_peer_data.get("unit_departed", "false"))
-
-    @unit_departed.setter
-    def unit_departed(self, value: bool) -> None:
-        """Set the unit_departed flag."""
-        if isinstance(value, bool):
-            self.unit_peer_data["unit_departed"] = json.dumps(value)
-        else:
-            raise ValueError(
-                f"'unit_departed' must be a boolean value. Provided: {value} is of type {type(value)}"
-            )
-
     def is_role_changed(self) -> bool:
         """Checks if application is running in provided role."""
         return self.role != self.model.config["role"]
@@ -542,6 +529,11 @@ def get_charm_internal_revision(self) -> str:
         with open(Config.CHARM_INTERNAL_VERSION_FILE, "r") as f:
             return f.read().strip()
 
+    @property
+    def _is_removing_last_replica(self) -> bool:
+        """Returns True if the last replica (juju unit) is getting removed."""
+        return self.app.planned_units() == 0 and len(self.peers_units) == 0
+
     # END: properties
 
     # BEGIN: generic helper methods
@@ -802,10 +794,6 @@ def _relation_changes_handler(self, event: RelationEvent) -> None:
         self._connect_mongodb_exporter()
         self._connect_pbm_agent()
 
-        if isinstance(event, RelationDepartedEvent):
-            if event.departing_unit.name == self.unit.name:
-                self.unit_departed = True
-
         if not self.unit.is_leader():
             return
 
@@ -881,28 +869,6 @@ def __handle_partition_on_stop(self) -> None:
         kubernetes_upgrades.partition.set(app_name=self.app.name, value=current_unit_number)
         logger.debug(f"Partition set to {current_unit_number} during stop event")
 
-    def __handle_relation_departed_on_stop(self) -> None:
-        """Leaves replicaset.
-
-        If the unit has not already left the replica set, this function
-        attempts to block operations until the unit is removed. Note that with
-        how Juju currently operates, we only have 30 seconds until SIGTERM
-        command, so we are by no means guaranteed to have removed the replica
-        before the pod is removed. However the leader will reconfigure the
-        replica set if this is the case on `update status`.
-        """
-        logger.debug(f"{self.unit.name} blocking on_stop")
-        is_in_replica_set = True
-        timeout = UNIT_REMOVAL_TIMEOUT
-        while is_in_replica_set and timeout > 0:
-            is_in_replica_set = self.is_unit_in_replica_set()
-            time.sleep(1)
-            timeout -= 1
-        if timeout < 0:
-            raise Exception(f"{self.unit.name}.on_stop timeout exceeded")
-        logger.debug("{self.unit.name} releasing on_stop")
-        self.unit_departed = False
-
     def __handle_upgrade_on_stop(self) -> None:
         """Sets the unit state to RESTARTING and step down from replicaset.
 
@@ -926,13 +892,62 @@ def __handle_upgrade_on_stop(self) -> None:
 
     def _on_stop(self, event) -> None:
         self.__handle_partition_on_stop()
-        if self.unit_departed:
-            self.__handle_relation_departed_on_stop()
         if not self.upgrade._upgrade:
             logger.debug("Peer relation missing during stop event")
             return
         self.__handle_upgrade_on_stop()
 
+    def mongodb_storage_detaching(self, event) -> None:
+        """Before storage detaches, allow removing unit to remove itself from the set.
+
+        If the removing unit is primary also allow it to step down and elect another unit as
+        primary while it still has access to its storage.
+        """
+        if self.upgrade_in_progress:
+            # We cannot defer and prevent a user from removing a unit, log a warning instead.
+            logger.warning(
+                "Removing replicas during an upgrade is not supported. The charm may be in a broken, unrecoverable state"
+            )
+
+        # A single replica cannot step down as primary and we cannot reconfigure the replica set to
+        # have 0 members.
+        if self._is_removing_last_replica:
+            # removing config-server from a sharded cluster can be disastrous.
+            if self.is_role(Config.Role.CONFIG_SERVER) and self.config_server.has_shards():
+                current_shards = self.config_server.get_related_shards()
+                early_removal_message = f"Cannot remove config-server, still related to shards {', '.join(current_shards)}"
+                logger.error(early_removal_message)
+                raise EarlyRemovalOfConfigServerError(early_removal_message)
+
+            # cannot drain shard after storage detached.
+            if self.is_role(Config.Role.SHARD) and self.shard.has_config_server():
+                logger.info("Wait for shard to drain before detaching storage.")
+                self.status.set_and_share_status(MaintenanceStatus("Draining shard from cluster"))
+                mongos_hosts = self.shard.get_mongos_hosts()
+                self.shard.wait_for_draining(mongos_hosts)
+                logger.info("Shard successfully drained storage.")
+
+        try:
+            # retries over a period of 10 minutes in an attempt to resolve race conditions; it is
+            # not possible to defer in storage-detaching hooks.
+            logger.debug("Removing %s from replica set", self.unit_host(self.unit))
+            for attempt in Retrying(
+                stop=stop_after_attempt(600),
+                wait=wait_fixed(1),
+                reraise=True,
+            ):
+                with attempt:
+                    # in K8s we have the leader remove the unit from the replica set to reduce race
+                    # conditions
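+                    # raising here makes tenacity retry (up to 600 attempts, 1s apart) until the
+                    # leader has removed this unit or the retry window elapses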
+                    if self.is_unit_in_replica_set():
+                        raise UnitStillInReplicaSet()
+
+        except NotReadyError:
+            logger.info(
+                "Failed to remove %s from replica set, another member is syncing", self.unit.name
+            )
+        except PyMongoError as e:
+            logger.error("Failed to remove %s from replica set, error=%r", self.unit.name, e)
+
     def _on_update_status(self, event: UpdateStatusEvent):
         # user-made mistakes might result in other incorrect statues. Prioritise informing users of
         # their mistake.
@@ -1206,6 +1221,17 @@ def _init_backup_user(self):
     # END: user management
 
     # BEGIN: helper functions
+    def _update_related_hosts(self, event) -> None:
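+        """Updates the hosts exposed over relations after replica set membership changes."""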
+        # app relations should be made aware of the new set of hosts
+        try:
+            if not self.is_role(Config.Role.SHARD):
+                self.client_relations.update_app_relation_data()
+            self.config_server.update_mongos_hosts()
+            self.cluster.update_config_server_db(event)
+        except PyMongoError as e:
+            logger.error("Deferring on updating app relation data since: error: %r", e)
+            event.defer()
+            return
 
     def _is_user_created(self, user: MongoDBUser) -> bool:
         return f"{user.get_username()}-user-created" in self.app_peer_data
@@ -1377,7 +1403,7 @@ def _add_units_from_replica_set(
             mongo.add_replset_member(member)
 
     def _remove_units_from_replica_set(
-        self, evemt, mongo: MongoDBConnection, units_to_remove: Set[str]
+        self, event, mongo: MongoDBConnection, units_to_remove: Set[str]
     ) -> None:
         for member in units_to_remove:
             logger.debug("Removing %s from the replica set", member)