Skip to content

Commit 91c52d5

Browse files
authored
Merge pull request #496 from hardikdr/feature/autoscaler-annotation
Inhibit scale-down by autoscaler during roll-outs.
2 parents 90f8b67 + f5a0478 commit 91c52d5

File tree

14 files changed

+1561
-33
lines changed

14 files changed

+1561
-33
lines changed

cmd/machine-controller-manager/app/controllermanager.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -273,6 +273,7 @@ func StartControllers(s *options.MCMServer,
273273
s.NodeConditions,
274274
s.BootstrapTokenAuthExtraGroups,
275275
s.DeleteMigratedMachineClass,
276+
s.AutoscalerScaleDownAnnotationDuringRollout,
276277
)
277278
if err != nil {
278279
return err

cmd/machine-controller-manager/app/options/options.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@ func NewMCMServer() *MCMServer {
6262
KubeAPIBurst: 30,
6363
LeaderElection: leaderelectionconfig.DefaultLeaderElectionConfiguration(),
6464
ControllerStartInterval: metav1.Duration{Duration: 0 * time.Second},
65+
AutoscalerScaleDownAnnotationDuringRollout: true,
6566
SafetyOptions: machineconfig.SafetyOptions{
6667
SafetyUp: 2,
6768
SafetyDown: 1,
@@ -115,6 +116,8 @@ func (s *MCMServer) AddFlags(fs *pflag.FlagSet) {
115116
fs.StringVar(&s.BootstrapTokenAuthExtraGroups, "bootstrap-token-auth-extra-groups", s.BootstrapTokenAuthExtraGroups, "Comma-separated list of groups to set bootstrap token's \"auth-extra-groups\" field to")
116117
fs.BoolVar(&s.DeleteMigratedMachineClass, "delete-migrated-machine-class", false, "Deletes any (provider specific) machine class that has the machine.sapcloud.io/migrated annotation")
117118

119+
fs.BoolVar(&s.AutoscalerScaleDownAnnotationDuringRollout, "autoscaler-scaldown-annotation-during-rollout", true, "Add cluster autoscaler scale-down disabled annotation during roll-out.")
120+
118121
leaderelectionconfig.BindFlags(&s.LeaderElection, fs)
119122
// TODO: DefaultFeatureGate is global and it adds all k8s flags
120123
// utilfeature.DefaultFeatureGate.AddFlag(fs)

pkg/controller/controller.go

Lines changed: 31 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -81,32 +81,34 @@ func NewController(
8181
nodeConditions string,
8282
bootstrapTokenAuthExtraGroups string,
8383
deleteMigratedMachineClass bool,
84+
autoscalerScaleDownAnnotationDuringRollout bool,
8485
) (Controller, error) {
8586
controller := &controller{
86-
namespace: namespace,
87-
controlMachineClient: controlMachineClient,
88-
controlCoreClient: controlCoreClient,
89-
targetCoreClient: targetCoreClient,
90-
recorder: recorder,
91-
expectations: NewUIDTrackingContExpectations(NewContExpectations()),
92-
secretQueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "secret"),
93-
nodeQueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "node"),
94-
openStackMachineClassQueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "openstackmachineclass"),
95-
awsMachineClassQueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "awsmachineclass"),
96-
azureMachineClassQueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "azuremachineclass"),
97-
gcpMachineClassQueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "gcpmachineclass"),
98-
alicloudMachineClassQueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "alicloudmachineclass"),
99-
packetMachineClassQueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "packetmachineclass"),
100-
machineQueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "machine"),
101-
machineSetQueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "machineset"),
102-
machineDeploymentQueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "machinedeployment"),
103-
machineSafetyOrphanVMsQueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "machinesafetyorphanvms"),
104-
machineSafetyOvershootingQueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "machinesafetyovershooting"),
105-
machineSafetyAPIServerQueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "machinesafetyapiserver"),
106-
safetyOptions: safetyOptions,
107-
nodeConditions: nodeConditions,
108-
bootstrapTokenAuthExtraGroups: bootstrapTokenAuthExtraGroups,
109-
deleteMigratedMachineClass: deleteMigratedMachineClass,
87+
namespace: namespace,
88+
controlMachineClient: controlMachineClient,
89+
controlCoreClient: controlCoreClient,
90+
targetCoreClient: targetCoreClient,
91+
recorder: recorder,
92+
expectations: NewUIDTrackingContExpectations(NewContExpectations()),
93+
secretQueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "secret"),
94+
nodeQueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "node"),
95+
openStackMachineClassQueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "openstackmachineclass"),
96+
awsMachineClassQueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "awsmachineclass"),
97+
azureMachineClassQueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "azuremachineclass"),
98+
gcpMachineClassQueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "gcpmachineclass"),
99+
alicloudMachineClassQueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "alicloudmachineclass"),
100+
packetMachineClassQueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "packetmachineclass"),
101+
machineQueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "machine"),
102+
machineSetQueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "machineset"),
103+
machineDeploymentQueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "machinedeployment"),
104+
machineSafetyOrphanVMsQueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "machinesafetyorphanvms"),
105+
machineSafetyOvershootingQueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "machinesafetyovershooting"),
106+
machineSafetyAPIServerQueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "machinesafetyapiserver"),
107+
safetyOptions: safetyOptions,
108+
nodeConditions: nodeConditions,
109+
bootstrapTokenAuthExtraGroups: bootstrapTokenAuthExtraGroups,
110+
deleteMigratedMachineClass: deleteMigratedMachineClass,
111+
autoscalerScaleDownAnnotationDuringRollout: autoscalerScaleDownAnnotationDuringRollout,
110112
}
111113

112114
controller.internalExternalScheme = runtime.NewScheme()
@@ -398,10 +400,11 @@ type Controller interface {
398400

399401
// controller is a concrete Controller.
400402
type controller struct {
401-
namespace string
402-
nodeConditions string
403-
bootstrapTokenAuthExtraGroups string
404-
deleteMigratedMachineClass bool
403+
namespace string
404+
nodeConditions string
405+
bootstrapTokenAuthExtraGroups string
406+
deleteMigratedMachineClass bool
407+
autoscalerScaleDownAnnotationDuringRollout bool
405408

406409
controlMachineClient machineapi.MachineV1alpha1Interface
407410
controlCoreClient kubernetes.Interface

pkg/controller/controller_utils.go

Lines changed: 128 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,10 +33,12 @@ import (
3333
"sync/atomic"
3434
"time"
3535

36+
"k8s.io/apimachinery/pkg/api/errors"
3637
"k8s.io/apimachinery/pkg/api/validation"
3738

3839
"github.com/gardener/machine-controller-manager/pkg/apis/machine/v1alpha1"
3940
machineapi "github.com/gardener/machine-controller-manager/pkg/client/clientset/versioned/typed/machine/v1alpha1"
41+
annotationsutils "github.com/gardener/machine-controller-manager/pkg/util/annotations"
4042
conditionutils "github.com/gardener/machine-controller-manager/pkg/util/conditions"
4143
hashutil "github.com/gardener/machine-controller-manager/pkg/util/hash"
4244
taintutils "github.com/gardener/machine-controller-manager/pkg/util/taints"
@@ -96,6 +98,13 @@ var Backoff = wait.Backoff{
9698
Jitter: 1.0,
9799
}
98100

101+
// UpdateAnnotationBackoff is the retry backoff (up to 5 steps, starting at 100ms, with jitter) used while updating node annotations
102+
var UpdateAnnotationBackoff = wait.Backoff{
103+
Steps: 5,
104+
Duration: 100 * time.Millisecond,
105+
Jitter: 1.0,
106+
}
107+
99108
var (
100109
// KeyFunc is the variable that stores the function that retrieves the object key from an object
101110
KeyFunc = cache.DeletionHandlingMetaNamespaceKeyFunc
@@ -1111,3 +1120,122 @@ func ComputeHash(template *v1alpha1.MachineTemplateSpec, collisionCount *int32)
11111120

11121121
return machineTemplateSpecHasher.Sum32()
11131122
}
1123+
1124+
// AddOrUpdateAnnotationOnNode add annotations to the node. If annotation was added into node, it'll issue API calls
1125+
// to update nodes; otherwise, no API calls. Return error if any.
1126+
func AddOrUpdateAnnotationOnNode(c clientset.Interface, nodeName string, annotations map[string]string) error {
1127+
if annotations == nil {
1128+
return nil
1129+
}
1130+
firstTry := true
1131+
return clientretry.RetryOnConflict(UpdateAnnotationBackoff, func() error {
1132+
var err error
1133+
var oldNode *v1.Node
1134+
// First we try getting node from the API server cache, as it's cheaper. If it fails
1135+
// we get it from etcd to be sure to have fresh data.
1136+
if firstTry {
1137+
oldNode, err = c.CoreV1().Nodes().Get(nodeName, metav1.GetOptions{ResourceVersion: "0"})
1138+
firstTry = false
1139+
} else {
1140+
oldNode, err = c.CoreV1().Nodes().Get(nodeName, metav1.GetOptions{})
1141+
}
1142+
if errors.IsNotFound(err) {
1143+
klog.Warningf("Node %s not found while updating annotation. Err: %v", nodeName, err)
1144+
return nil
1145+
}
1146+
if err != nil {
1147+
return err
1148+
}
1149+
1150+
var newNode *v1.Node
1151+
updated := false
1152+
1153+
newNode, updated, err = annotationsutils.AddOrUpdateAnnotation(oldNode, annotations)
1154+
1155+
if !updated {
1156+
return nil
1157+
}
1158+
return UpdateNodeAnnotations(c, nodeName, oldNode, newNode)
1159+
})
1160+
}
1161+
1162+
// UpdateNodeAnnotations is for updating the node annotations from oldNode to the newNode
1163+
// using the nodes Update() method
1164+
func UpdateNodeAnnotations(c clientset.Interface, nodeName string, oldNode *v1.Node, newNode *v1.Node) error {
1165+
newNodeClone := oldNode.DeepCopy()
1166+
newNodeClone.Annotations = newNode.Annotations
1167+
1168+
_, err := c.CoreV1().Nodes().Update(newNodeClone)
1169+
if err != nil {
1170+
return fmt.Errorf("failed to create or update annotations for node %q: %v", nodeName, err)
1171+
}
1172+
1173+
return err
1174+
}
1175+
1176+
// RemoveAnnotationsOffNode is for cleaning up annotations temporarily added to node,
1177+
// won't fail if target annotation doesn't exist or has been removed.
1178+
// If passed a node it'll check if there's anything to be done, if annotation is not present it won't issue
1179+
// any API calls.
1180+
func RemoveAnnotationsOffNode(c clientset.Interface, nodeName string, annotations map[string]string) error {
1181+
1182+
// Short circuit if annotation doesnt exist for limiting API calls.
1183+
if annotations == nil || nodeName == "" {
1184+
return nil
1185+
}
1186+
1187+
firstTry := true
1188+
return clientretry.RetryOnConflict(UpdateAnnotationBackoff, func() error {
1189+
var err error
1190+
var oldNode *v1.Node
1191+
// First we try getting node from the API server cache, as it's cheaper. If it fails
1192+
// we get it from etcd to be sure to have fresh data.
1193+
if firstTry {
1194+
oldNode, err = c.CoreV1().Nodes().Get(nodeName, metav1.GetOptions{ResourceVersion: "0"})
1195+
firstTry = false
1196+
} else {
1197+
oldNode, err = c.CoreV1().Nodes().Get(nodeName, metav1.GetOptions{})
1198+
}
1199+
if errors.IsNotFound(err) {
1200+
klog.Warningf("Node %s not found while removing annotation. Err: %v", nodeName, err)
1201+
return nil
1202+
}
1203+
1204+
if err != nil {
1205+
return err
1206+
}
1207+
1208+
var newNode *v1.Node
1209+
oldNodeCopy := oldNode
1210+
updated := false
1211+
1212+
// Remove the annotations from the node.
1213+
newNode, updated, err = annotationsutils.RemoveAnnotation(oldNodeCopy, annotations)
1214+
1215+
if !updated {
1216+
return nil
1217+
}
1218+
return UpdateNodeAnnotations(c, nodeName, oldNode, newNode)
1219+
})
1220+
}
1221+
1222+
// GetAnnotationsFromNode returns all the annotations of the provided node.
1223+
func GetAnnotationsFromNode(c clientset.Interface, nodeName string) (map[string]string, error) {
1224+
1225+
// Short circuit if annotation doesnt exist for limiting API calls.
1226+
if nodeName == "" {
1227+
return nil, nil
1228+
}
1229+
1230+
node, err := c.CoreV1().Nodes().Get(nodeName, metav1.GetOptions{})
1231+
if errors.IsNotFound(err) {
1232+
klog.Warningf("Node %s not found while fetching annotation. Err: %v", nodeName, err)
1233+
return nil, nil
1234+
}
1235+
1236+
if err != nil {
1237+
return nil, err
1238+
}
1239+
1240+
return node.Annotations, nil
1241+
}

0 commit comments

Comments
 (0)