
Commit d1c0151

cordon and drain azuremachinepoolmachines prior to delete
1 parent fc15cb6 commit d1c0151

13 files changed (+589 −48 lines)

azure/scope/machinepoolmachine.go

Lines changed: 175 additions & 20 deletions
@@ -18,19 +18,25 @@ package scope

 import (
     "context"
+    "fmt"
     "reflect"
+    "time"

     "github.com/go-logr/logr"
     "github.com/pkg/errors"
     corev1 "k8s.io/api/core/v1"
     apierrors "k8s.io/apimachinery/pkg/api/errors"
-    "k8s.io/client-go/tools/clientcmd"
+    "k8s.io/client-go/kubernetes"
+    "k8s.io/klog/v2"
     "k8s.io/klog/v2/klogr"
     "k8s.io/utils/pointer"
+    clusterv1 "sigs.k8s.io/cluster-api/api/v1alpha4"
     "sigs.k8s.io/cluster-api/controllers/noderefutil"
+    "sigs.k8s.io/cluster-api/controllers/remote"
     capierrors "sigs.k8s.io/cluster-api/errors"
     capiv1exp "sigs.k8s.io/cluster-api/exp/api/v1alpha4"
-    utilkubeconfig "sigs.k8s.io/cluster-api/util/kubeconfig"
+    drain "sigs.k8s.io/cluster-api/third_party/kubernetes-drain"
+    "sigs.k8s.io/cluster-api/util/conditions"
     "sigs.k8s.io/cluster-api/util/patch"
     "sigs.k8s.io/controller-runtime/pkg/client"

@@ -40,6 +46,11 @@ import (
     "sigs.k8s.io/cluster-api-provider-azure/util/tele"
 )

+const (
+    // MachinePoolMachineScopeName is the sourceName, or more specifically the UserAgent, of client used in cordon and drain.
+    MachinePoolMachineScopeName = "azuremachinepoolmachine-scope"
+)
+
 type (
     nodeGetter interface {
         GetNodeByProviderID(ctx context.Context, providerID string) (*corev1.Node, error)

@@ -261,6 +272,158 @@ func (s *MachinePoolMachineScope) UpdateStatus(ctx context.Context) error {
     return nil
 }

+// CordonAndDrain will cordon and drain the Kubernetes node associated with this AzureMachinePoolMachine.
+func (s *MachinePoolMachineScope) CordonAndDrain(ctx context.Context) error {
+    ctx, span := tele.Tracer().Start(ctx, "scope.MachinePoolMachineScope.CordonAndDrain")
+    defer span.End()
+
+    var (
+        nodeRef = s.AzureMachinePoolMachine.Status.NodeRef
+        node    *corev1.Node
+        err     error
+    )
+    if nodeRef == nil || nodeRef.Name == "" {
+        node, err = s.workloadNodeGetter.GetNodeByProviderID(ctx, s.ProviderID())
+    } else {
+        node, err = s.workloadNodeGetter.GetNodeByObjectReference(ctx, *nodeRef)
+    }
+
+    if err != nil && apierrors.IsNotFound(err) {
+        return nil // node was already gone, so no need to cordon and drain
+    } else if err != nil {
+        return errors.Wrap(err, "failed to find node")
+    }
+
+    // Drain node before deletion and issue a patch in order to make this operation visible to the users.
+    if s.isNodeDrainAllowed() {
+        patchHelper, err := patch.NewHelper(s.AzureMachinePoolMachine, s.client)
+        if err != nil {
+            return errors.Wrap(err, "failed to build a patchHelper when draining node")
+        }
+
+        s.V(4).Info("Draining node", "node", s.AzureMachinePoolMachine.Status.NodeRef.Name)
+        // The DrainingSucceededCondition never exists before the node is drained for the first time,
+        // so its transition time can be used to record the first time draining.
+        // This `if` condition prevents the transition time to be changed more than once.
+        if conditions.Get(s.AzureMachinePoolMachine, clusterv1.DrainingSucceededCondition) == nil {
+            conditions.MarkFalse(s.AzureMachinePoolMachine, clusterv1.DrainingSucceededCondition, clusterv1.DrainingReason, clusterv1.ConditionSeverityInfo, "Draining the node before deletion")
+        }
+
+        if err := patchHelper.Patch(ctx, s.AzureMachinePoolMachine); err != nil {
+            return errors.Wrap(err, "failed to patch AzureMachinePoolMachine")
+        }
+
+        if err := s.drainNode(ctx, node); err != nil {
+            // Check for condition existence. If the condition exists, it may have a different severity or message, which
+            // would cause the last transition time to be updated. The last transition time is used to determine how
+            // long to wait to timeout the node drain operation. If we were to keep updating the last transition time,
+            // a drain operation may never timeout.
+            if conditions.Get(s.AzureMachinePoolMachine, clusterv1.DrainingSucceededCondition) == nil {
+                conditions.MarkFalse(s.AzureMachinePoolMachine, clusterv1.DrainingSucceededCondition, clusterv1.DrainingFailedReason, clusterv1.ConditionSeverityWarning, err.Error())
+            }
+            return err
+        }
+
+        conditions.MarkTrue(s.AzureMachinePoolMachine, clusterv1.DrainingSucceededCondition)
+    }
+
+    return nil
+}
+
+func (s *MachinePoolMachineScope) drainNode(ctx context.Context, node *corev1.Node) error {
+    ctx, span := tele.Tracer().Start(ctx, "scope.MachinePoolMachineScope.drainNode")
+    defer span.End()
+
+    restConfig, err := remote.RESTConfig(ctx, MachinePoolMachineScopeName, s.client, client.ObjectKey{
+        Name:      s.ClusterName(),
+        Namespace: s.AzureMachinePoolMachine.Namespace,
+    })
+
+    if err != nil {
+        s.Error(err, "Error creating a remote client while deleting Machine, won't retry")
+        return nil
+    }
+
+    kubeClient, err := kubernetes.NewForConfig(restConfig)
+    if err != nil {
+        s.Error(err, "Error creating a remote client while deleting Machine, won't retry")
+        return nil
+    }
+
+    drainer := &drain.Helper{
+        Client:              kubeClient,
+        Force:               true,
+        IgnoreAllDaemonSets: true,
+        DeleteLocalData:     true,
+        GracePeriodSeconds:  -1,
+        // If a pod is not evicted in 20 seconds, retry the eviction next time the
+        // machine gets reconciled again (to allow other machines to be reconciled).
+        Timeout: 20 * time.Second,
+        OnPodDeletedOrEvicted: func(pod *corev1.Pod, usingEviction bool) {
+            verbStr := "Deleted"
+            if usingEviction {
+                verbStr = "Evicted"
+            }
+            s.V(4).Info(fmt.Sprintf("%s pod from Node", verbStr),
+                "pod", fmt.Sprintf("%s/%s", pod.Name, pod.Namespace))
+        },
+        Out:    writer{klog.Info},
+        ErrOut: writer{klog.Error},
+        DryRun: false,
+    }
+
+    if noderefutil.IsNodeUnreachable(node) {
+        // When the node is unreachable and some pods are not evicted for as long as this timeout, we ignore them.
+        drainer.SkipWaitForDeleteTimeoutSeconds = 60 * 5 // 5 minutes
+    }
+
+    if err := drain.RunCordonOrUncordon(ctx, drainer, node, true); err != nil {
+        // Machine will be re-reconciled after a cordon failure.
+        return azure.WithTransientError(errors.Errorf("unable to cordon node %s: %v", node.Name, err), 20*time.Second)
+    }
+
+    if err := drain.RunNodeDrain(ctx, drainer, node.Name); err != nil {
+        // Machine will be re-reconciled after a drain failure.
+        return azure.WithTransientError(errors.Wrap(err, "Drain failed, retry in 20s"), 20*time.Second)
+    }
+
+    s.V(4).Info("Drain successful")
+    return nil
+}
+
+// isNodeDrainAllowed checks to see the node is excluded from draining or if the NodeDrainTimeout has expired.
+func (s *MachinePoolMachineScope) isNodeDrainAllowed() bool {
+    if _, exists := s.AzureMachinePoolMachine.ObjectMeta.Annotations[clusterv1.ExcludeNodeDrainingAnnotation]; exists {
+        return false
+    }
+
+    if s.nodeDrainTimeoutExceeded() {
+        return false
+    }
+
+    return true
+}
+
+// nodeDrainTimeoutExceeded will check to see if the AzureMachinePool's NodeDrainTimeout is exceeded for the
+// AzureMachinePoolMachine.
+func (s *MachinePoolMachineScope) nodeDrainTimeoutExceeded() bool {
+    // if the NodeDrainTimeout type is not set by user
+    pool := s.AzureMachinePool
+    if pool == nil || pool.Spec.NodeDrainTimeout == nil || pool.Spec.NodeDrainTimeout.Seconds() <= 0 {
+        return false
+    }
+
+    // if the draining succeeded condition does not exist
+    if conditions.Get(s.AzureMachinePoolMachine, clusterv1.DrainingSucceededCondition) == nil {
+        return false
+    }
+
+    now := time.Now()
+    firstTimeDrain := conditions.GetLastTransitionTime(s.AzureMachinePoolMachine, clusterv1.DrainingSucceededCondition)
+    diff := now.Sub(firstTimeDrain.Time)
+    return diff.Seconds() >= s.AzureMachinePool.Spec.NodeDrainTimeout.Seconds()
+}
+
 func (s *MachinePoolMachineScope) hasLatestModelApplied() (bool, error) {
     if s.instance == nil {
         return false, errors.New("instance must not be nil")

@@ -344,24 +507,16 @@ func getWorkloadClient(ctx context.Context, c client.Client, cluster client.Obje
     ctx, span := tele.Tracer().Start(ctx, "scope.MachinePoolMachineScope.getWorkloadClient")
     defer span.End()

-    obj := client.ObjectKey{
-        Namespace: cluster.Namespace,
-        Name:      cluster.Name,
-    }
-    dataBytes, err := utilkubeconfig.FromSecret(ctx, c, obj)
-    if err != nil {
-        return nil, errors.Wrapf(err, "\"%s-kubeconfig\" not found in namespace %q", obj.Name, obj.Namespace)
-    }
-
-    config, err := clientcmd.Load(dataBytes)
-    if err != nil {
-        return nil, errors.Wrapf(err, "failed to load \"%s-kubeconfig\" in namespace %q", obj.Name, obj.Namespace)
-    }
+    return remote.NewClusterClient(ctx, MachinePoolMachineScopeName, c, cluster)
+}

-    restConfig, err := clientcmd.NewDefaultClientConfig(*config, &clientcmd.ConfigOverrides{}).ClientConfig()
-    if err != nil {
-        return nil, errors.Wrapf(err, "failed transform config \"%s-kubeconfig\" in namespace %q", obj.Name, obj.Namespace)
-    }
+// writer implements io.Writer interface as a pass-through for klog.
+type writer struct {
+    logFunc func(args ...interface{})
+}

-    return client.New(restConfig, client.Options{})
+// Write passes string(p) into writer's logFunc and always returns len(p).
+func (w writer) Write(p []byte) (n int, err error) {
+    w.logFunc(string(p))
+    return len(p), nil
 }
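
Editor's note: the drain added above can be opted out of per machine via the upstream cluster-api annotation that isNodeDrainAllowed checks. The following is a minimal, hypothetical sketch (not part of this commit; the package name, import alias, and helper are illustrative assumptions):

package example

import (
    clusterv1 "sigs.k8s.io/cluster-api/api/v1alpha4"
    infrav1exp "sigs.k8s.io/cluster-api-provider-azure/exp/api/v1alpha4"
)

// excludeFromDrain sets the exclude-node-draining annotation on a single
// AzureMachinePoolMachine; with it present, isNodeDrainAllowed returns false,
// CordonAndDrain becomes a no-op, and only the VM delete proceeds.
func excludeFromDrain(ampm *infrav1exp.AzureMachinePoolMachine) {
    if ampm.Annotations == nil {
        ampm.Annotations = map[string]string{}
    }
    ampm.Annotations[clusterv1.ExcludeNodeDrainingAnnotation] = "true"
}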

config/crd/bases/infrastructure.cluster.x-k8s.io_azuremachinepools.yaml

Lines changed: 3 additions & 0 deletions
@@ -462,6 +462,9 @@ spec:
             location:
               description: Location is the Azure region location e.g. westus2
               type: string
+            nodeDrainTimeout:
+              description: 'NodeDrainTimeout is the total amount of time that the controller will spend on draining a node. The default value is 0, meaning that the node can be drained without any time limitations. NOTE: NodeDrainTimeout is different from `kubectl drain --timeout`'
+              type: string
             providerID:
               description: ProviderID is the identification ID of the Virtual Machine Scale Set
               type: string

exp/api/v1alpha3/azuremachinepool_conversion.go

Lines changed: 4 additions & 0 deletions
@@ -56,6 +56,10 @@ func (src *AzureMachinePool) ConvertTo(dstRaw conversion.Hub) error { // nolint
         dst.Spec.Strategy.RollingUpdate.DeletePolicy = restored.Spec.Strategy.RollingUpdate.DeletePolicy
     }

+    if restored.Spec.NodeDrainTimeout != nil {
+        dst.Spec.NodeDrainTimeout = restored.Spec.NodeDrainTimeout
+    }
+
     if restored.Status.Image != nil {
         dst.Status.Image = restored.Status.Image
     }

exp/api/v1alpha3/zz_generated.conversion.go

Lines changed: 1 addition & 0 deletions
Generated file; diff not rendered by default.

exp/api/v1alpha4/azuremachinepool_types.go

Lines changed: 6 additions & 0 deletions
@@ -134,6 +134,12 @@ type (
         // +optional
         // +kubebuilder:default={type: "RollingUpdate", rollingUpdate: {maxSurge: 1, maxUnavailable: 0, deletePolicy: Oldest}}
         Strategy AzureMachinePoolDeploymentStrategy `json:"strategy,omitempty"`
+
+        // NodeDrainTimeout is the total amount of time that the controller will spend on draining a node.
+        // The default value is 0, meaning that the node can be drained without any time limitations.
+        // NOTE: NodeDrainTimeout is different from `kubectl drain --timeout`
+        // +optional
+        NodeDrainTimeout *metav1.Duration `json:"nodeDrainTimeout,omitempty"`
     }

     // AzureMachinePoolDeploymentStrategyType is the type of deployment strategy employed to rollout a new version of
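
Editor's note: as a rough usage sketch (not from this commit; the package name and import alias are assumptions), a pool author could cap draining at ten minutes like so. Once the DrainingSucceeded condition has been False for longer than this duration, nodeDrainTimeoutExceeded reports true and the machine is deleted without further drain retries.

package example

import (
    "time"

    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    infrav1exp "sigs.k8s.io/cluster-api-provider-azure/exp/api/v1alpha4"
)

// tenMinuteDrainPool is an illustrative AzureMachinePool with the new
// NodeDrainTimeout field set to 10 minutes.
var tenMinuteDrainPool = infrav1exp.AzureMachinePool{
    Spec: infrav1exp.AzureMachinePoolSpec{
        NodeDrainTimeout: &metav1.Duration{Duration: 10 * time.Minute},
    },
}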

exp/api/v1alpha4/zz_generated.deepcopy.go

Lines changed: 9 additions & 3 deletions
Generated file; diff not rendered by default.

exp/controllers/azuremachinepoolmachine_controller.go

Lines changed: 6 additions & 2 deletions
@@ -377,8 +377,12 @@ func (r *azureMachinePoolMachineReconciler) Delete(ctx context.Context) error {
         }
     }()

-    err := r.scalesetVMsService.Delete(ctx)
-    if err != nil {
+    // Cordon and drain the node before deleting the scale set VM.
+    if err := r.Scope.CordonAndDrain(ctx); err != nil {
+        return errors.Wrap(err, "failed to cordon and drain the scalesetVMs")
+    }
+
+    if err := r.scalesetVMsService.Delete(ctx); err != nil {
         return errors.Wrap(err, "failed to reconcile scalesetVMs")
     }
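
Editor's note on the error path: cordon and drain failures come back wrapped with azure.WithTransientError, so a caller further up the reconcile chain can requeue instead of failing hard. Below is a minimal sketch of that handling, assuming CAPZ's azure.ReconcileError helpers (IsTransient, RequeueAfter) behave as named here; it is not part of this commit.

package example

import (
    "github.com/pkg/errors"
    ctrl "sigs.k8s.io/controller-runtime"

    "sigs.k8s.io/cluster-api-provider-azure/azure"
)

// requeueOnTransient is an illustrative helper: it converts the transient error
// returned by a failed cordon or drain into a delayed requeue and leaves any
// other error untouched.
func requeueOnTransient(err error) (ctrl.Result, error) {
    if reconcileError, ok := errors.Cause(err).(azure.ReconcileError); ok && reconcileError.IsTransient() {
        return ctrl.Result{RequeueAfter: reconcileError.RequeueAfter()}, nil
    }
    return ctrl.Result{}, err
}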

test/e2e/azure_lb.go

Lines changed: 1 addition & 1 deletion
@@ -78,7 +78,7 @@ func AzureLBSpec(ctx context.Context, inputGetter func() AzureLBSpecInput) {
         deploymentName = "web-windows" + util.RandomString(6)
     }

-    webDeployment := deploymentBuilder.CreateDeployment("httpd", deploymentName, corev1.NamespaceDefault)
+    webDeployment := deploymentBuilder.Create("httpd", deploymentName, corev1.NamespaceDefault)
     webDeployment.AddContainerPort("http", "http", 80, corev1.ProtocolTCP)

     if input.Windows {
