@@ -49,10 +49,11 @@ func (c *controller) reconcileClusterMachineSafety(key string) error {
4949 var wg sync.WaitGroup
5050
5151 glog .V (3 ).Info ("SafetyCheck loop initializing" )
52- wg .Add (3 )
52+ wg .Add (2 )
5353 go c .checkAndFreezeORUnfreezeMachineSets (& wg )
5454 go c .checkVMObjects (& wg )
55- go c .checkAndFreezeMachineSetTimeout (& wg )
55+ //Disable permanent freeze for now. We should enable it again once we have a sophisticated automatic unfreeze mechanism in place.
56+ //go c.checkAndFreezeMachineSetTimeout(&wg)
5657 wg .Wait ()
5758 c .machineSafetyQueue .AddAfter ("" , 60 * time .Second )
5859
@@ -63,10 +64,11 @@ func (c *controller) reconcileClusterMachineSafety(key string) error {
6364// which have much greater than desired number of replicas of machine objects
6465func (c * controller ) checkAndFreezeORUnfreezeMachineSets (wg * sync.WaitGroup ) {
6566
67+ defer wg .Done ()
68+
6669 machineSets , err := c .machineSetLister .List (labels .Everything ())
6770 if err != nil {
6871 glog .Error ("Safety-Net: Error getting machineSets - " , err )
69- wg .Done ()
7072 return
7173 }
7274
@@ -75,14 +77,13 @@ func (c *controller) checkAndFreezeORUnfreezeMachineSets(wg *sync.WaitGroup) {
7577 filteredMachines , err := c .machineLister .List (labels .Everything ())
7678 if err != nil {
7779 glog .Error ("Safety-Net: Error getting machines - " , err )
78- wg .Done ()
7980 return
8081 }
8182 fullyLabeledReplicasCount := int32 (0 )
8283 templateLabel := labels .Set (machineSet .Spec .Template .Labels ).AsSelectorPreValidated ()
8384 for _ , machine := range filteredMachines {
8485 if templateLabel .Matches (labels .Set (machine .Labels )) &&
85- len (machine .OwnerReferences ) = = 1 &&
86+ len (machine .OwnerReferences ) > = 1 &&
8687 machine .OwnerReferences [0 ].Name == machineSet .Name {
8788 fullyLabeledReplicasCount ++
8889 }
@@ -103,9 +104,7 @@ func (c *controller) checkAndFreezeORUnfreezeMachineSets(wg *sync.WaitGroup) {
103104 true ,
104105 )
105106 if err != nil {
106- //TODO explore if we can log/annotate this machineset and continue here.
107107 glog .Error ("Safety-Net: Error getting surge value - " , err )
108- wg .Done ()
109108 return
110109 }
111110
@@ -133,13 +132,13 @@ func (c *controller) checkAndFreezeORUnfreezeMachineSets(wg *sync.WaitGroup) {
133132 c .freezeMachineSetsAndDeployments (machineSet , OverShootingReplicaCount , message )
134133
135134 } else if machineSet .Labels ["freeze" ] == "True" &&
136- machineSet .Status .Conditions != nil &&
137- GetCondition (& machineSet .Status , v1alpha1 .MachineSetFrozen ).Reason == OverShootingReplicaCount &&
135+ //TODO: Reintroduce these checks once we have automated unfreeze for MachineTimeout aka meltdown.
136+ //machineSet.Status.Conditions != nil &&
137+ //GetCondition(&machineSet.Status, v1alpha1.MachineSetFrozen).Reason == OverShootingReplicaCount &&
138138 fullyLabeledReplicasCount <= lowerThreshold {
139139 c .unfreezeMachineSetsAndDeployments (machineSet )
140140 }
141141 }
142- wg .Done ()
143142}
144143
145144// checkVMObjects checks for orphan VMs (VMs that don't have a machine object backing)
@@ -456,6 +455,8 @@ func (c *controller) freezeMachineSetsAndDeployments(machineSet *v1alpha1.Machin
456455 glog .V (2 ).Infof ("Freezing MachineSet %q due to %q" , machineSet .Name , reason )
457456
458457 for {
458+ // TODO: Replace it with better retry logic. Replace all occurrences similarly.
459+ // Ref: https://github.com/kubernetes/kubernetes/blob/master/pkg/controller/deployment/util/replicaset_util.go#L35
459460 // Get the latest version of the machineSet so that we can avoid conflicts
460461 machineSet , err := c .controlMachineClient .MachineSets (machineSet .Namespace ).Get (machineSet .Name , metav1.GetOptions {})
461462 if err != nil {
0 commit comments