Skip to content

Commit f9e82c9

Browse files
prashanth26hardikdr
authored andcommitted
Bugfix/machinedeployment freeze sync (#264)
* Machine deployment freeze state synchronizer Machine deployment was sometimes found in a partial freeze state where either of the label or status was missing. This change fixes this by creating a machine deployment freeze state synchronizer * Added tests for machine deployment freeze state synchronizer * Removed LastUpdateTimestamp for machine sets This field/functionality was no longer being used and hence being removed from the MCM code
1 parent b8b56a5 commit f9e82c9

File tree

4 files changed

+219
-82
lines changed

4 files changed

+219
-82
lines changed

pkg/controller/controller.go

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ import (
3131
machinelisters "github.com/gardener/machine-controller-manager/pkg/client/listers/machine/v1alpha1"
3232
"github.com/golang/glog"
3333
"github.com/prometheus/client_golang/prometheus"
34-
"k8s.io/api/core/v1"
34+
v1 "k8s.io/api/core/v1"
3535
coreinformers "k8s.io/client-go/informers/core/v1"
3636
v1core "k8s.io/client-go/kubernetes/typed/core/v1"
3737
corelisters "k8s.io/client-go/listers/core/v1"
@@ -372,11 +372,6 @@ func NewController(
372372
DeleteFunc: controller.deleteMachineToSafety,
373373
})
374374

375-
machineSetInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
376-
AddFunc: controller.addMachineSetToSafety,
377-
UpdateFunc: controller.updateMachineSetToSafety,
378-
})
379-
380375
return controller, nil
381376
}
382377

pkg/controller/deployment_util.go

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -396,7 +396,6 @@ var annotationsToSkip = map[string]bool{
396396
RevisionHistoryAnnotation: true,
397397
DesiredReplicasAnnotation: true,
398398
MaxReplicasAnnotation: true,
399-
LastReplicaUpdate: true,
400399
PreferNoScheduleKey: true,
401400
UnfreezeAnnotation: true,
402401
}

pkg/controller/machine_safety.go

Lines changed: 102 additions & 75 deletions
Original file line numberDiff line numberDiff line change
@@ -36,11 +36,10 @@ import (
3636
const (
3737
// OverShootingReplicaCount freeze reason when replica count overshoots
3838
OverShootingReplicaCount = "OverShootingReplicaCount"
39+
// MachineDeploymentStateSync freeze reason when machineDeployment was found with inconsistent state
40+
MachineDeploymentStateSync = "MachineDeploymentStateSync"
3941
// TimeoutOccurred freeze reason when machineSet timeout occurs
4042
TimeoutOccurred = "MachineSetTimeoutOccurred"
41-
// LastReplicaUpdate contains the last timestamp when the
42-
// number of replicas was changed
43-
LastReplicaUpdate = "safety.machine.sapcloud.io/lastreplicaupdate"
4443
// UnfreezeAnnotation indicates the controllers to unfreeze this object
4544
UnfreezeAnnotation = "safety.machine.sapcloud.io/unfreeze"
4645
)
@@ -74,6 +73,12 @@ func (c *controller) reconcileClusterMachineSafetyOvershooting(key string) error
7473
}
7574
cache.WaitForCacheSync(stopCh, c.machineSetSynced, c.machineDeploymentSynced)
7675

76+
err = c.syncMachineDeploymentFreezeState()
77+
if err != nil {
78+
glog.Errorf("SafetyController: %v", err)
79+
}
80+
cache.WaitForCacheSync(stopCh, c.machineDeploymentSynced)
81+
7782
err = c.unfreezeMachineDeploymentsWithUnfreezeAnnotation()
7883
if err != nil {
7984
glog.Errorf("SafetyController: %v", err)
@@ -201,7 +206,7 @@ func (c *controller) unfreezeMachineDeploymentsWithUnfreezeAnnotation() error {
201206
if _, exists := machineDeployment.Annotations[UnfreezeAnnotation]; exists {
202207
glog.V(2).Infof("SafetyController: UnFreezing MachineDeployment %q due to setting unfreeze annotation", machineDeployment.Name)
203208

204-
err := c.unfreezeMachineDeployment(machineDeployment)
209+
err := c.unfreezeMachineDeployment(machineDeployment, "UnfreezeAnnotation")
205210
if err != nil {
206211
return err
207212
}
@@ -257,6 +262,60 @@ func (c *controller) unfreezeMachineSetsWithUnfreezeAnnotation() error {
257262
return nil
258263
}
259264

265+
// syncMachineDeploymentFreezeState syncs freeze labels and conditions to keep it consistent
266+
func (c *controller) syncMachineDeploymentFreezeState() error {
267+
machineDeployments, err := c.machineDeploymentLister.List(labels.Everything())
268+
if err != nil {
269+
glog.Error("SafetyController: Error while trying to LIST machineDeployments - ", err)
270+
return err
271+
}
272+
273+
for _, machineDeployment := range machineDeployments {
274+
275+
machineDeploymentFreezeLabelPresent := (machineDeployment.Labels["freeze"] == "True")
276+
machineDeploymentFrozenConditionPresent := (GetMachineDeploymentCondition(machineDeployment.Status, v1alpha1.MachineDeploymentFrozen) != nil)
277+
278+
machineDeploymentHasFrozenMachineSet := false
279+
machineSets, err := c.getMachineSetsForMachineDeployment(machineDeployment)
280+
if err == nil {
281+
for _, machineSet := range machineSets {
282+
machineSetFreezeLabelPresent := (machineSet.Labels["freeze"] == "True")
283+
machineSetFrozenConditionPresent := (GetCondition(&machineSet.Status, v1alpha1.MachineSetFrozen) != nil)
284+
285+
if machineSetFreezeLabelPresent || machineSetFrozenConditionPresent {
286+
machineDeploymentHasFrozenMachineSet = true
287+
break
288+
}
289+
}
290+
}
291+
292+
if machineDeploymentHasFrozenMachineSet {
293+
// If machineDeployment has atleast one frozen machine set backing it
294+
295+
if !machineDeploymentFreezeLabelPresent || !machineDeploymentFrozenConditionPresent {
296+
// Either the freeze label or freeze condition is not present on the machineDeployment
297+
message := "MachineDeployment State was inconsistent, hence safety controller has fixed this and frozen it"
298+
err := c.freezeMachineDeployment(machineDeployment, MachineDeploymentStateSync, message)
299+
if err != nil {
300+
return err
301+
}
302+
}
303+
} else {
304+
// If machineDeployment has no frozen machine set backing it
305+
306+
if machineDeploymentFreezeLabelPresent || machineDeploymentFrozenConditionPresent {
307+
// Either the freeze label or freeze condition is present present on the machineDeployment
308+
err := c.unfreezeMachineDeployment(machineDeployment, MachineDeploymentStateSync)
309+
if err != nil {
310+
return err
311+
}
312+
}
313+
}
314+
}
315+
316+
return nil
317+
}
318+
260319
// checkAndFreezeORUnfreezeMachineSets freezes/unfreezes machineSets/machineDeployments
261320
// which have much greater than desired number of replicas of machine objects
262321
func (c *controller) checkAndFreezeORUnfreezeMachineSets() error {
@@ -552,21 +611,6 @@ func (c *controller) checkMachineClass(
552611
}
553612
}
554613

555-
// addMachineSetToSafety enqueues into machineSafetyQueue when a new machineSet is added
556-
func (c *controller) addMachineSetToSafety(obj interface{}) {
557-
machineSet := obj.(*v1alpha1.MachineSet)
558-
c.updateTimeStamp(machineSet)
559-
}
560-
561-
// updateMachineSetToSafety adds update timestamp
562-
func (c *controller) updateMachineSetToSafety(old, new interface{}) {
563-
oldMS := old.(*v1alpha1.MachineSet)
564-
newMS := new.(*v1alpha1.MachineSet)
565-
if oldMS.Spec.Replicas != newMS.Spec.Replicas {
566-
c.updateTimeStamp(newMS)
567-
}
568-
}
569-
570614
// addMachineToSafety enqueues into machineSafetyQueue when a new machine is added
571615
func (c *controller) addMachineToSafety(obj interface{}) {
572616
machine := obj.(*v1alpha1.Machine)
@@ -579,31 +623,6 @@ func (c *controller) deleteMachineToSafety(obj interface{}) {
579623
c.enqueueMachineSafetyOrphanVMsKey(machine)
580624
}
581625

582-
// updateTimeStamp adds an annotation indicating the last time the number of replicas
583-
// of machineSet was changed
584-
func (c *controller) updateTimeStamp(ms *v1alpha1.MachineSet) {
585-
for {
586-
// Get the latest version of the machineSet so that we can avoid conflicts
587-
ms, err := c.controlMachineClient.MachineSets(ms.Namespace).Get(ms.Name, metav1.GetOptions{})
588-
if err != nil {
589-
// Some error occurred while fetching object from API server
590-
glog.Error(err)
591-
break
592-
}
593-
clone := ms.DeepCopy()
594-
if clone.Annotations == nil {
595-
clone.Annotations = make(map[string]string)
596-
}
597-
clone.Annotations[LastReplicaUpdate] = metav1.Now().Format("2006-01-02 15:04:05 MST")
598-
_, err = c.controlMachineClient.MachineSets(clone.Namespace).Update(clone)
599-
if err == nil {
600-
break
601-
}
602-
// Keep retrying until update goes through
603-
glog.Error("SafetyController: Failure while trying to UPDATE timestamp, Error: ", err)
604-
}
605-
}
606-
607626
// enqueueMachineSafetyOvershootingKey enqueues into machineSafetyOvershootingQueue
608627
func (c *controller) enqueueMachineSafetyOvershootingKey(obj interface{}) {
609628
c.machineSafetyOvershootingQueue.Add("")
@@ -680,37 +699,10 @@ func (c *controller) freezeMachineSetAndDeployment(machineSet *v1alpha1.MachineS
680699
if len(machineDeployments) >= 1 {
681700
machineDeployment := machineDeployments[0]
682701
if machineDeployment != nil {
683-
684-
// Get the latest version of the machineDeployment so that we can avoid conflicts
685-
machineDeployment, err := c.controlMachineClient.MachineDeployments(machineDeployment.Namespace).Get(machineDeployment.Name, metav1.GetOptions{})
686-
if err != nil {
687-
glog.Errorf("SafetyController: Failed to GET machineDeployment. Error: %s", err)
688-
return err
689-
}
690-
691-
clone := machineDeployment.DeepCopy()
692-
newStatus := clone.Status
693-
mdcond := NewMachineDeploymentCondition(v1alpha1.MachineDeploymentFrozen, v1alpha1.ConditionTrue, reason, message)
694-
SetMachineDeploymentCondition(&newStatus, *mdcond)
695-
clone.Status = newStatus
696-
machineDeployment, err = c.controlMachineClient.MachineDeployments(clone.Namespace).UpdateStatus(clone)
697-
if err != nil {
698-
glog.Errorf("SafetyController: MachineDeployment/status UPDATE failed. Error: %s", err)
699-
return err
700-
}
701-
702-
clone = machineDeployment.DeepCopy()
703-
if clone.Labels == nil {
704-
clone.Labels = make(map[string]string)
705-
}
706-
clone.Labels["freeze"] = "True"
707-
_, err = c.controlMachineClient.MachineDeployments(clone.Namespace).Update(clone)
702+
err := c.freezeMachineDeployment(machineDeployment, reason, message)
708703
if err != nil {
709-
glog.Errorf("SafetyController: MachineDeployment UPDATE failed. Error: %s", err)
710704
return err
711705
}
712-
713-
glog.V(2).Infof("SafetyController: Froze MachineDeployment %q due to overshooting of replicas", machineSet.Name)
714706
}
715707
}
716708

@@ -726,7 +718,7 @@ func (c *controller) unfreezeMachineSetAndDeployment(machineSet *v1alpha1.Machin
726718
machineDeployments := c.getMachineDeploymentsForMachineSet(machineSet)
727719
if len(machineDeployments) >= 1 {
728720
machineDeployment := machineDeployments[0]
729-
err := c.unfreezeMachineDeployment(machineDeployment)
721+
err := c.unfreezeMachineDeployment(machineDeployment, "UnderShootingReplicaCount")
730722
if err != nil {
731723
return err
732724
}
@@ -786,8 +778,43 @@ func (c *controller) unfreezeMachineSet(machineSet *v1alpha1.MachineSet) error {
786778
return nil
787779
}
788780

781+
// freezeMachineDeployment freezes the machineDeployment
782+
func (c *controller) freezeMachineDeployment(machineDeployment *v1alpha1.MachineDeployment, reason string, message string) error {
783+
// Get the latest version of the machineDeployment so that we can avoid conflicts
784+
machineDeployment, err := c.controlMachineClient.MachineDeployments(machineDeployment.Namespace).Get(machineDeployment.Name, metav1.GetOptions{})
785+
if err != nil {
786+
glog.Errorf("SafetyController: Failed to GET machineDeployment. Error: %s", err)
787+
return err
788+
}
789+
790+
clone := machineDeployment.DeepCopy()
791+
newStatus := clone.Status
792+
mdcond := NewMachineDeploymentCondition(v1alpha1.MachineDeploymentFrozen, v1alpha1.ConditionTrue, reason, message)
793+
SetMachineDeploymentCondition(&newStatus, *mdcond)
794+
clone.Status = newStatus
795+
machineDeployment, err = c.controlMachineClient.MachineDeployments(clone.Namespace).UpdateStatus(clone)
796+
if err != nil {
797+
glog.Errorf("SafetyController: MachineDeployment/status UPDATE failed. Error: %s", err)
798+
return err
799+
}
800+
801+
clone = machineDeployment.DeepCopy()
802+
if clone.Labels == nil {
803+
clone.Labels = make(map[string]string)
804+
}
805+
clone.Labels["freeze"] = "True"
806+
_, err = c.controlMachineClient.MachineDeployments(clone.Namespace).Update(clone)
807+
if err != nil {
808+
glog.Errorf("SafetyController: MachineDeployment UPDATE failed. Error: %s", err)
809+
return err
810+
}
811+
812+
glog.V(2).Infof("SafetyController: Froze MachineDeployment %q due to %s", machineDeployment.Name, reason)
813+
return nil
814+
}
815+
789816
// unfreezeMachineDeployment unfreezes the machineDeployment
790-
func (c *controller) unfreezeMachineDeployment(machineDeployment *v1alpha1.MachineDeployment) error {
817+
func (c *controller) unfreezeMachineDeployment(machineDeployment *v1alpha1.MachineDeployment, reason string) error {
791818

792819
if machineDeployment == nil {
793820
err := fmt.Errorf("SafetyController: Machine Deployment not passed")
@@ -828,6 +855,6 @@ func (c *controller) unfreezeMachineDeployment(machineDeployment *v1alpha1.Machi
828855
return err
829856
}
830857

831-
glog.V(2).Infof("SafetyController: Unfroze MachineDeployment %q", machineDeployment.Name)
858+
glog.V(2).Infof("SafetyController: Unfroze MachineDeployment %q due to %s", machineDeployment.Name, reason)
832859
return nil
833860
}

0 commit comments

Comments
 (0)