@@ -36,11 +36,10 @@ import (
3636const (
3737 // OverShootingReplicaCount freeze reason when replica count overshoots
3838 OverShootingReplicaCount = "OverShootingReplicaCount"
39+ // MachineDeploymentStateSync freeze reason when machineDeployment was found with inconsistent state
40+ MachineDeploymentStateSync = "MachineDeploymentStateSync"
3941 // TimeoutOccurred freeze reason when machineSet timeout occurs
4042 TimeoutOccurred = "MachineSetTimeoutOccurred"
41- // LastReplicaUpdate contains the last timestamp when the
42- // number of replicas was changed
43- LastReplicaUpdate = "safety.machine.sapcloud.io/lastreplicaupdate"
4443 // UnfreezeAnnotation indicates the controllers to unfreeze this object
4544 UnfreezeAnnotation = "safety.machine.sapcloud.io/unfreeze"
4645)
@@ -74,6 +73,12 @@ func (c *controller) reconcileClusterMachineSafetyOvershooting(key string) error
7473 }
7574 cache .WaitForCacheSync (stopCh , c .machineSetSynced , c .machineDeploymentSynced )
7675
76+ err = c .syncMachineDeploymentFreezeState ()
77+ if err != nil {
78+ glog .Errorf ("SafetyController: %v" , err )
79+ }
80+ cache .WaitForCacheSync (stopCh , c .machineDeploymentSynced )
81+
7782 err = c .unfreezeMachineDeploymentsWithUnfreezeAnnotation ()
7883 if err != nil {
7984 glog .Errorf ("SafetyController: %v" , err )
@@ -201,7 +206,7 @@ func (c *controller) unfreezeMachineDeploymentsWithUnfreezeAnnotation() error {
201206 if _ , exists := machineDeployment .Annotations [UnfreezeAnnotation ]; exists {
202207 glog .V (2 ).Infof ("SafetyController: UnFreezing MachineDeployment %q due to setting unfreeze annotation" , machineDeployment .Name )
203208
204- err := c .unfreezeMachineDeployment (machineDeployment )
209+ err := c .unfreezeMachineDeployment (machineDeployment , "UnfreezeAnnotation" )
205210 if err != nil {
206211 return err
207212 }
@@ -257,6 +262,60 @@ func (c *controller) unfreezeMachineSetsWithUnfreezeAnnotation() error {
257262 return nil
258263}
259264
265+ // syncMachineDeploymentFreezeState syncs freeze labels and conditions to keep it consistent
266+ func (c * controller ) syncMachineDeploymentFreezeState () error {
267+ machineDeployments , err := c .machineDeploymentLister .List (labels .Everything ())
268+ if err != nil {
269+ glog .Error ("SafetyController: Error while trying to LIST machineDeployments - " , err )
270+ return err
271+ }
272+
273+ for _ , machineDeployment := range machineDeployments {
274+
275+ machineDeploymentFreezeLabelPresent := (machineDeployment .Labels ["freeze" ] == "True" )
276+ machineDeploymentFrozenConditionPresent := (GetMachineDeploymentCondition (machineDeployment .Status , v1alpha1 .MachineDeploymentFrozen ) != nil )
277+
278+ machineDeploymentHasFrozenMachineSet := false
279+ machineSets , err := c .getMachineSetsForMachineDeployment (machineDeployment )
280+ if err == nil {
281+ for _ , machineSet := range machineSets {
282+ machineSetFreezeLabelPresent := (machineSet .Labels ["freeze" ] == "True" )
283+ machineSetFrozenConditionPresent := (GetCondition (& machineSet .Status , v1alpha1 .MachineSetFrozen ) != nil )
284+
285+ if machineSetFreezeLabelPresent || machineSetFrozenConditionPresent {
286+ machineDeploymentHasFrozenMachineSet = true
287+ break
288+ }
289+ }
290+ }
291+
292+ if machineDeploymentHasFrozenMachineSet {
293+ // If machineDeployment has atleast one frozen machine set backing it
294+
295+ if ! machineDeploymentFreezeLabelPresent || ! machineDeploymentFrozenConditionPresent {
296+ // Either the freeze label or freeze condition is not present on the machineDeployment
297+ message := "MachineDeployment State was inconsistent, hence safety controller has fixed this and frozen it"
298+ err := c .freezeMachineDeployment (machineDeployment , MachineDeploymentStateSync , message )
299+ if err != nil {
300+ return err
301+ }
302+ }
303+ } else {
304+ // If machineDeployment has no frozen machine set backing it
305+
306+ if machineDeploymentFreezeLabelPresent || machineDeploymentFrozenConditionPresent {
307+ // Either the freeze label or freeze condition is present present on the machineDeployment
308+ err := c .unfreezeMachineDeployment (machineDeployment , MachineDeploymentStateSync )
309+ if err != nil {
310+ return err
311+ }
312+ }
313+ }
314+ }
315+
316+ return nil
317+ }
318+
260319// checkAndFreezeORUnfreezeMachineSets freezes/unfreezes machineSets/machineDeployments
261320// which have much greater than desired number of replicas of machine objects
262321func (c * controller ) checkAndFreezeORUnfreezeMachineSets () error {
@@ -552,21 +611,6 @@ func (c *controller) checkMachineClass(
552611 }
553612}
554613
555- // addMachineSetToSafety enqueues into machineSafetyQueue when a new machineSet is added
556- func (c * controller ) addMachineSetToSafety (obj interface {}) {
557- machineSet := obj .(* v1alpha1.MachineSet )
558- c .updateTimeStamp (machineSet )
559- }
560-
561- // updateMachineSetToSafety adds update timestamp
562- func (c * controller ) updateMachineSetToSafety (old , new interface {}) {
563- oldMS := old .(* v1alpha1.MachineSet )
564- newMS := new .(* v1alpha1.MachineSet )
565- if oldMS .Spec .Replicas != newMS .Spec .Replicas {
566- c .updateTimeStamp (newMS )
567- }
568- }
569-
570614// addMachineToSafety enqueues into machineSafetyQueue when a new machine is added
571615func (c * controller ) addMachineToSafety (obj interface {}) {
572616 machine := obj .(* v1alpha1.Machine )
@@ -579,31 +623,6 @@ func (c *controller) deleteMachineToSafety(obj interface{}) {
579623 c .enqueueMachineSafetyOrphanVMsKey (machine )
580624}
581625
582- // updateTimeStamp adds an annotation indicating the last time the number of replicas
583- // of machineSet was changed
584- func (c * controller ) updateTimeStamp (ms * v1alpha1.MachineSet ) {
585- for {
586- // Get the latest version of the machineSet so that we can avoid conflicts
587- ms , err := c .controlMachineClient .MachineSets (ms .Namespace ).Get (ms .Name , metav1.GetOptions {})
588- if err != nil {
589- // Some error occurred while fetching object from API server
590- glog .Error (err )
591- break
592- }
593- clone := ms .DeepCopy ()
594- if clone .Annotations == nil {
595- clone .Annotations = make (map [string ]string )
596- }
597- clone .Annotations [LastReplicaUpdate ] = metav1 .Now ().Format ("2006-01-02 15:04:05 MST" )
598- _ , err = c .controlMachineClient .MachineSets (clone .Namespace ).Update (clone )
599- if err == nil {
600- break
601- }
602- // Keep retrying until update goes through
603- glog .Error ("SafetyController: Failure while trying to UPDATE timestamp, Error: " , err )
604- }
605- }
606-
607626// enqueueMachineSafetyOvershootingKey enqueues into machineSafetyOvershootingQueue
608627func (c * controller ) enqueueMachineSafetyOvershootingKey (obj interface {}) {
609628 c .machineSafetyOvershootingQueue .Add ("" )
@@ -680,37 +699,10 @@ func (c *controller) freezeMachineSetAndDeployment(machineSet *v1alpha1.MachineS
680699 if len (machineDeployments ) >= 1 {
681700 machineDeployment := machineDeployments [0 ]
682701 if machineDeployment != nil {
683-
684- // Get the latest version of the machineDeployment so that we can avoid conflicts
685- machineDeployment , err := c .controlMachineClient .MachineDeployments (machineDeployment .Namespace ).Get (machineDeployment .Name , metav1.GetOptions {})
686- if err != nil {
687- glog .Errorf ("SafetyController: Failed to GET machineDeployment. Error: %s" , err )
688- return err
689- }
690-
691- clone := machineDeployment .DeepCopy ()
692- newStatus := clone .Status
693- mdcond := NewMachineDeploymentCondition (v1alpha1 .MachineDeploymentFrozen , v1alpha1 .ConditionTrue , reason , message )
694- SetMachineDeploymentCondition (& newStatus , * mdcond )
695- clone .Status = newStatus
696- machineDeployment , err = c .controlMachineClient .MachineDeployments (clone .Namespace ).UpdateStatus (clone )
697- if err != nil {
698- glog .Errorf ("SafetyController: MachineDeployment/status UPDATE failed. Error: %s" , err )
699- return err
700- }
701-
702- clone = machineDeployment .DeepCopy ()
703- if clone .Labels == nil {
704- clone .Labels = make (map [string ]string )
705- }
706- clone .Labels ["freeze" ] = "True"
707- _ , err = c .controlMachineClient .MachineDeployments (clone .Namespace ).Update (clone )
702+ err := c .freezeMachineDeployment (machineDeployment , reason , message )
708703 if err != nil {
709- glog .Errorf ("SafetyController: MachineDeployment UPDATE failed. Error: %s" , err )
710704 return err
711705 }
712-
713- glog .V (2 ).Infof ("SafetyController: Froze MachineDeployment %q due to overshooting of replicas" , machineSet .Name )
714706 }
715707 }
716708
@@ -726,7 +718,7 @@ func (c *controller) unfreezeMachineSetAndDeployment(machineSet *v1alpha1.Machin
726718 machineDeployments := c .getMachineDeploymentsForMachineSet (machineSet )
727719 if len (machineDeployments ) >= 1 {
728720 machineDeployment := machineDeployments [0 ]
729- err := c .unfreezeMachineDeployment (machineDeployment )
721+ err := c .unfreezeMachineDeployment (machineDeployment , "UnderShootingReplicaCount" )
730722 if err != nil {
731723 return err
732724 }
@@ -786,8 +778,43 @@ func (c *controller) unfreezeMachineSet(machineSet *v1alpha1.MachineSet) error {
786778 return nil
787779}
788780
781+ // freezeMachineDeployment freezes the machineDeployment
782+ func (c * controller ) freezeMachineDeployment (machineDeployment * v1alpha1.MachineDeployment , reason string , message string ) error {
783+ // Get the latest version of the machineDeployment so that we can avoid conflicts
784+ machineDeployment , err := c .controlMachineClient .MachineDeployments (machineDeployment .Namespace ).Get (machineDeployment .Name , metav1.GetOptions {})
785+ if err != nil {
786+ glog .Errorf ("SafetyController: Failed to GET machineDeployment. Error: %s" , err )
787+ return err
788+ }
789+
790+ clone := machineDeployment .DeepCopy ()
791+ newStatus := clone .Status
792+ mdcond := NewMachineDeploymentCondition (v1alpha1 .MachineDeploymentFrozen , v1alpha1 .ConditionTrue , reason , message )
793+ SetMachineDeploymentCondition (& newStatus , * mdcond )
794+ clone .Status = newStatus
795+ machineDeployment , err = c .controlMachineClient .MachineDeployments (clone .Namespace ).UpdateStatus (clone )
796+ if err != nil {
797+ glog .Errorf ("SafetyController: MachineDeployment/status UPDATE failed. Error: %s" , err )
798+ return err
799+ }
800+
801+ clone = machineDeployment .DeepCopy ()
802+ if clone .Labels == nil {
803+ clone .Labels = make (map [string ]string )
804+ }
805+ clone .Labels ["freeze" ] = "True"
806+ _ , err = c .controlMachineClient .MachineDeployments (clone .Namespace ).Update (clone )
807+ if err != nil {
808+ glog .Errorf ("SafetyController: MachineDeployment UPDATE failed. Error: %s" , err )
809+ return err
810+ }
811+
812+ glog .V (2 ).Infof ("SafetyController: Froze MachineDeployment %q due to %s" , machineDeployment .Name , reason )
813+ return nil
814+ }
815+
789816// unfreezeMachineDeployment unfreezes the machineDeployment
790- func (c * controller ) unfreezeMachineDeployment (machineDeployment * v1alpha1.MachineDeployment ) error {
817+ func (c * controller ) unfreezeMachineDeployment (machineDeployment * v1alpha1.MachineDeployment , reason string ) error {
791818
792819 if machineDeployment == nil {
793820 err := fmt .Errorf ("SafetyController: Machine Deployment not passed" )
@@ -828,6 +855,6 @@ func (c *controller) unfreezeMachineDeployment(machineDeployment *v1alpha1.Machi
828855 return err
829856 }
830857
831- glog .V (2 ).Infof ("SafetyController: Unfroze MachineDeployment %q" , machineDeployment .Name )
858+ glog .V (2 ).Infof ("SafetyController: Unfroze MachineDeployment %q due to %s " , machineDeployment .Name , reason )
832859 return nil
833860}
0 commit comments