diff --git a/cmd/clusterawsadm/cloudformation/bootstrap/cluster_api_controller.go b/cmd/clusterawsadm/cloudformation/bootstrap/cluster_api_controller.go index 0569eaac85..2b74d5507e 100644 --- a/cmd/clusterawsadm/cloudformation/bootstrap/cluster_api_controller.go +++ b/cmd/clusterawsadm/cloudformation/bootstrap/cluster_api_controller.go @@ -198,6 +198,7 @@ func (t Template) ControllersPolicy() *iamv1.PolicyDocument { "arn:*:autoscaling:*:*:autoScalingGroup:*:autoScalingGroupName/*", }, Action: iamv1.Actions{ + "autoscaling:CancelInstanceRefresh", "autoscaling:CreateAutoScalingGroup", "autoscaling:UpdateAutoScalingGroup", "autoscaling:CreateOrUpdateTags", diff --git a/cmd/clusterawsadm/cloudformation/bootstrap/fixtures/customsuffix.yaml b/cmd/clusterawsadm/cloudformation/bootstrap/fixtures/customsuffix.yaml index 9d1b3549a3..b7c87b71f3 100644 --- a/cmd/clusterawsadm/cloudformation/bootstrap/fixtures/customsuffix.yaml +++ b/cmd/clusterawsadm/cloudformation/bootstrap/fixtures/customsuffix.yaml @@ -254,6 +254,7 @@ Resources: Resource: - '*' - Action: + - autoscaling:CancelInstanceRefresh - autoscaling:CreateAutoScalingGroup - autoscaling:UpdateAutoScalingGroup - autoscaling:CreateOrUpdateTags diff --git a/cmd/clusterawsadm/cloudformation/bootstrap/fixtures/default.yaml b/cmd/clusterawsadm/cloudformation/bootstrap/fixtures/default.yaml index 42b0764b62..0c504b481a 100644 --- a/cmd/clusterawsadm/cloudformation/bootstrap/fixtures/default.yaml +++ b/cmd/clusterawsadm/cloudformation/bootstrap/fixtures/default.yaml @@ -254,6 +254,7 @@ Resources: Resource: - '*' - Action: + - autoscaling:CancelInstanceRefresh - autoscaling:CreateAutoScalingGroup - autoscaling:UpdateAutoScalingGroup - autoscaling:CreateOrUpdateTags diff --git a/cmd/clusterawsadm/cloudformation/bootstrap/fixtures/with_all_secret_backends.yaml b/cmd/clusterawsadm/cloudformation/bootstrap/fixtures/with_all_secret_backends.yaml index 980fb5389e..154f643157 100644 --- a/cmd/clusterawsadm/cloudformation/bootstrap/fixtures/with_all_secret_backends.yaml +++ b/cmd/clusterawsadm/cloudformation/bootstrap/fixtures/with_all_secret_backends.yaml @@ -260,6 +260,7 @@ Resources: Resource: - '*' - Action: + - autoscaling:CancelInstanceRefresh - autoscaling:CreateAutoScalingGroup - autoscaling:UpdateAutoScalingGroup - autoscaling:CreateOrUpdateTags diff --git a/cmd/clusterawsadm/cloudformation/bootstrap/fixtures/with_allow_assume_role.yaml b/cmd/clusterawsadm/cloudformation/bootstrap/fixtures/with_allow_assume_role.yaml index 155c66210a..7855f46153 100644 --- a/cmd/clusterawsadm/cloudformation/bootstrap/fixtures/with_allow_assume_role.yaml +++ b/cmd/clusterawsadm/cloudformation/bootstrap/fixtures/with_allow_assume_role.yaml @@ -254,6 +254,7 @@ Resources: Resource: - '*' - Action: + - autoscaling:CancelInstanceRefresh - autoscaling:CreateAutoScalingGroup - autoscaling:UpdateAutoScalingGroup - autoscaling:CreateOrUpdateTags diff --git a/cmd/clusterawsadm/cloudformation/bootstrap/fixtures/with_bootstrap_user.yaml b/cmd/clusterawsadm/cloudformation/bootstrap/fixtures/with_bootstrap_user.yaml index d8fde6d6b5..e7aed8d3b3 100644 --- a/cmd/clusterawsadm/cloudformation/bootstrap/fixtures/with_bootstrap_user.yaml +++ b/cmd/clusterawsadm/cloudformation/bootstrap/fixtures/with_bootstrap_user.yaml @@ -260,6 +260,7 @@ Resources: Resource: - '*' - Action: + - autoscaling:CancelInstanceRefresh - autoscaling:CreateAutoScalingGroup - autoscaling:UpdateAutoScalingGroup - autoscaling:CreateOrUpdateTags diff --git a/cmd/clusterawsadm/cloudformation/bootstrap/fixtures/with_custom_bootstrap_user.yaml b/cmd/clusterawsadm/cloudformation/bootstrap/fixtures/with_custom_bootstrap_user.yaml index f42a9b0588..e9f9070bf1 100644 --- a/cmd/clusterawsadm/cloudformation/bootstrap/fixtures/with_custom_bootstrap_user.yaml +++ b/cmd/clusterawsadm/cloudformation/bootstrap/fixtures/with_custom_bootstrap_user.yaml @@ -260,6 +260,7 @@ Resources: Resource: - '*' - Action: + - autoscaling:CancelInstanceRefresh - autoscaling:CreateAutoScalingGroup - autoscaling:UpdateAutoScalingGroup - autoscaling:CreateOrUpdateTags diff --git a/cmd/clusterawsadm/cloudformation/bootstrap/fixtures/with_different_instance_profiles.yaml b/cmd/clusterawsadm/cloudformation/bootstrap/fixtures/with_different_instance_profiles.yaml index 4635dcb7da..3afefa2636 100644 --- a/cmd/clusterawsadm/cloudformation/bootstrap/fixtures/with_different_instance_profiles.yaml +++ b/cmd/clusterawsadm/cloudformation/bootstrap/fixtures/with_different_instance_profiles.yaml @@ -254,6 +254,7 @@ Resources: Resource: - '*' - Action: + - autoscaling:CancelInstanceRefresh - autoscaling:CreateAutoScalingGroup - autoscaling:UpdateAutoScalingGroup - autoscaling:CreateOrUpdateTags diff --git a/cmd/clusterawsadm/cloudformation/bootstrap/fixtures/with_eks_console.yaml b/cmd/clusterawsadm/cloudformation/bootstrap/fixtures/with_eks_console.yaml index 9bbfc7db2d..b27c5b6674 100644 --- a/cmd/clusterawsadm/cloudformation/bootstrap/fixtures/with_eks_console.yaml +++ b/cmd/clusterawsadm/cloudformation/bootstrap/fixtures/with_eks_console.yaml @@ -254,6 +254,7 @@ Resources: Resource: - '*' - Action: + - autoscaling:CancelInstanceRefresh - autoscaling:CreateAutoScalingGroup - autoscaling:UpdateAutoScalingGroup - autoscaling:CreateOrUpdateTags diff --git a/cmd/clusterawsadm/cloudformation/bootstrap/fixtures/with_eks_default_roles.yaml b/cmd/clusterawsadm/cloudformation/bootstrap/fixtures/with_eks_default_roles.yaml index 6ea929e3d8..3096d4404b 100644 --- a/cmd/clusterawsadm/cloudformation/bootstrap/fixtures/with_eks_default_roles.yaml +++ b/cmd/clusterawsadm/cloudformation/bootstrap/fixtures/with_eks_default_roles.yaml @@ -254,6 +254,7 @@ Resources: Resource: - '*' - Action: + - autoscaling:CancelInstanceRefresh - autoscaling:CreateAutoScalingGroup - autoscaling:UpdateAutoScalingGroup - autoscaling:CreateOrUpdateTags diff --git a/cmd/clusterawsadm/cloudformation/bootstrap/fixtures/with_eks_disable.yaml b/cmd/clusterawsadm/cloudformation/bootstrap/fixtures/with_eks_disable.yaml index b0416eba8c..8f47148b16 100644 --- a/cmd/clusterawsadm/cloudformation/bootstrap/fixtures/with_eks_disable.yaml +++ b/cmd/clusterawsadm/cloudformation/bootstrap/fixtures/with_eks_disable.yaml @@ -254,6 +254,7 @@ Resources: Resource: - '*' - Action: + - autoscaling:CancelInstanceRefresh - autoscaling:CreateAutoScalingGroup - autoscaling:UpdateAutoScalingGroup - autoscaling:CreateOrUpdateTags diff --git a/cmd/clusterawsadm/cloudformation/bootstrap/fixtures/with_eks_kms_prefix.yaml b/cmd/clusterawsadm/cloudformation/bootstrap/fixtures/with_eks_kms_prefix.yaml index 70e27777fa..c092580a60 100644 --- a/cmd/clusterawsadm/cloudformation/bootstrap/fixtures/with_eks_kms_prefix.yaml +++ b/cmd/clusterawsadm/cloudformation/bootstrap/fixtures/with_eks_kms_prefix.yaml @@ -254,6 +254,7 @@ Resources: Resource: - '*' - Action: + - autoscaling:CancelInstanceRefresh - autoscaling:CreateAutoScalingGroup - autoscaling:UpdateAutoScalingGroup - autoscaling:CreateOrUpdateTags diff --git a/cmd/clusterawsadm/cloudformation/bootstrap/fixtures/with_extra_statements.yaml b/cmd/clusterawsadm/cloudformation/bootstrap/fixtures/with_extra_statements.yaml index 15334f4107..ec1cb2743b 100644 --- a/cmd/clusterawsadm/cloudformation/bootstrap/fixtures/with_extra_statements.yaml +++ b/cmd/clusterawsadm/cloudformation/bootstrap/fixtures/with_extra_statements.yaml @@ -260,6 +260,7 @@ Resources: Resource: - '*' - Action: + - autoscaling:CancelInstanceRefresh - autoscaling:CreateAutoScalingGroup - autoscaling:UpdateAutoScalingGroup - autoscaling:CreateOrUpdateTags diff --git a/cmd/clusterawsadm/cloudformation/bootstrap/fixtures/with_s3_bucket.yaml b/cmd/clusterawsadm/cloudformation/bootstrap/fixtures/with_s3_bucket.yaml index e255832c06..71cbc5778a 100644 --- a/cmd/clusterawsadm/cloudformation/bootstrap/fixtures/with_s3_bucket.yaml +++ b/cmd/clusterawsadm/cloudformation/bootstrap/fixtures/with_s3_bucket.yaml @@ -254,6 +254,7 @@ Resources: Resource: - '*' - Action: + - autoscaling:CancelInstanceRefresh - autoscaling:CreateAutoScalingGroup - autoscaling:UpdateAutoScalingGroup - autoscaling:CreateOrUpdateTags diff --git a/cmd/clusterawsadm/cloudformation/bootstrap/fixtures/with_ssm_secret_backend.yaml b/cmd/clusterawsadm/cloudformation/bootstrap/fixtures/with_ssm_secret_backend.yaml index a82726d157..f0e8d57025 100644 --- a/cmd/clusterawsadm/cloudformation/bootstrap/fixtures/with_ssm_secret_backend.yaml +++ b/cmd/clusterawsadm/cloudformation/bootstrap/fixtures/with_ssm_secret_backend.yaml @@ -254,6 +254,7 @@ Resources: Resource: - '*' - Action: + - autoscaling:CancelInstanceRefresh - autoscaling:CreateAutoScalingGroup - autoscaling:UpdateAutoScalingGroup - autoscaling:CreateOrUpdateTags diff --git a/exp/controllers/awsmachinepool_controller.go b/exp/controllers/awsmachinepool_controller.go index 4963a6456b..b46d6dd7b8 100644 --- a/exp/controllers/awsmachinepool_controller.go +++ b/exp/controllers/awsmachinepool_controller.go @@ -22,6 +22,7 @@ import ( "fmt" "time" + autoscalingtypes "github.com/aws/aws-sdk-go-v2/service/autoscaling/types" "github.com/google/go-cmp/cmp" "github.com/google/go-cmp/cmp/cmpopts" "github.com/pkg/errors" @@ -296,17 +297,21 @@ func (r *AWSMachinePoolReconciler) reconcileNormal(ctx context.Context, machineP return ctrl.Result{}, err } - canUpdateLaunchTemplate := func() (bool, error) { + canStartInstanceRefresh := func() (bool, *autoscalingtypes.InstanceRefreshStatus, error) { // If there is a change: before changing the template, check if there exist an ongoing instance refresh, // because only 1 instance refresh can be "InProgress". If template is updated when refresh cannot be started, // that change will not trigger a refresh. Do not start an instance refresh if only userdata changed. if asg == nil { // If the ASG hasn't been created yet, there is no need to check if we can start the instance refresh. // But we want to update the LaunchTemplate because an error in the LaunchTemplate may be blocking the ASG creation. - return true, nil + return true, nil, nil } return asgsvc.CanStartASGInstanceRefresh(machinePoolScope) } + cancelInstanceRefresh := func() error { + machinePoolScope.Info("cancelling instance refresh") + return asgsvc.CancelASGInstanceRefresh(machinePoolScope) + } runPostLaunchTemplateUpdateOperation := func() error { // skip instance refresh if ASG is not created yet if asg == nil { @@ -330,11 +335,15 @@ func (r *AWSMachinePoolReconciler) reconcileNormal(ctx context.Context, machineP machinePoolScope.Info("starting instance refresh", "number of instances", machinePoolScope.MachinePool.Spec.Replicas) return asgsvc.StartASGInstanceRefresh(machinePoolScope) } - if err := reconSvc.ReconcileLaunchTemplate(ctx, machinePoolScope, machinePoolScope, s3Scope, ec2Svc, objectStoreSvc, canUpdateLaunchTemplate, runPostLaunchTemplateUpdateOperation); err != nil { + res, err := reconSvc.ReconcileLaunchTemplate(ctx, machinePoolScope, machinePoolScope, s3Scope, ec2Svc, objectStoreSvc, canStartInstanceRefresh, cancelInstanceRefresh, runPostLaunchTemplateUpdateOperation) + if err != nil { r.Recorder.Eventf(machinePoolScope.AWSMachinePool, corev1.EventTypeWarning, "FailedLaunchTemplateReconcile", "Failed to reconcile launch template: %v", err) machinePoolScope.Error(err, "failed to reconcile launch template") return ctrl.Result{}, err } + if res != nil { + return *res, nil + } // set the LaunchTemplateReady condition conditions.MarkTrue(machinePoolScope.AWSMachinePool, expinfrav1.LaunchTemplateReadyCondition) diff --git a/exp/controllers/awsmachinepool_controller_test.go b/exp/controllers/awsmachinepool_controller_test.go index 02dfae1911..a315296f9f 100644 --- a/exp/controllers/awsmachinepool_controller_test.go +++ b/exp/controllers/awsmachinepool_controller_test.go @@ -24,6 +24,7 @@ import ( "fmt" "testing" + autoscalingtypes "github.com/aws/aws-sdk-go-v2/service/autoscaling/types" "github.com/aws/aws-sdk-go-v2/service/s3" "github.com/aws/aws-sdk-go/aws" "github.com/aws/aws-sdk-go/service/ec2" @@ -309,7 +310,7 @@ func TestAWSMachinePoolReconciler(t *testing.T) { getASG(t, g) expectedErr := errors.New("no connection available ") - reconSvc.EXPECT().ReconcileLaunchTemplate(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).Return(expectedErr) + reconSvc.EXPECT().ReconcileLaunchTemplate(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).Return(nil, expectedErr) _, err := reconciler.reconcileNormal(context.Background(), ms, cs, cs, cs) g.Expect(errors.Cause(err)).To(MatchError(expectedErr)) }) @@ -335,7 +336,7 @@ func TestAWSMachinePoolReconciler(t *testing.T) { Subnets: []string{}, } - reconSvc.EXPECT().ReconcileLaunchTemplate(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).Return(nil) + reconSvc.EXPECT().ReconcileLaunchTemplate(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).Return(nil, nil) asgSvc.EXPECT().GetASGByName(gomock.Any()).Return(asg, nil) asgSvc.EXPECT().DescribeLifecycleHooks(gomock.Any()).Return(nil, nil) asgSvc.EXPECT().SubnetIDs(gomock.Any()).Return([]string{}, nil) @@ -373,7 +374,7 @@ func TestAWSMachinePoolReconciler(t *testing.T) { Subnets: []string{}, } - reconSvc.EXPECT().ReconcileLaunchTemplate(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).Return(nil) + reconSvc.EXPECT().ReconcileLaunchTemplate(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).Return(nil, nil) asgSvc.EXPECT().GetASGByName(gomock.Any()).Return(asg, nil) ec2Svc.EXPECT().InstanceIfExists(aws.String("1")).Return(&infrav1.Instance{ID: "1", Type: "m6.2xlarge"}, nil) ec2Svc.EXPECT().InstanceIfExists(aws.String("2")).Return(&infrav1.Instance{ID: "2", Type: "m6.2xlarge"}, nil) @@ -474,7 +475,7 @@ func TestAWSMachinePoolReconciler(t *testing.T) { }, })).To(Succeed()) - reconSvc.EXPECT().ReconcileLaunchTemplate(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).Return(nil) + reconSvc.EXPECT().ReconcileLaunchTemplate(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).Return(nil, nil) asgSvc.EXPECT().GetASGByName(gomock.Any()).Return(asg, nil) asgSvc.EXPECT().DescribeLifecycleHooks(gomock.Any()).Return(nil, nil) asgSvc.EXPECT().SubnetIDs(gomock.Any()).Return([]string{}, nil) @@ -509,7 +510,7 @@ func TestAWSMachinePoolReconciler(t *testing.T) { defer teardown(t, g) setSuspendedProcesses(t, g) - reconSvc.EXPECT().ReconcileLaunchTemplate(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).Return(nil) + reconSvc.EXPECT().ReconcileLaunchTemplate(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).Return(nil, nil) asgSvc.EXPECT().GetASGByName(gomock.Any()).Return(nil, nil) asgSvc.EXPECT().CreateASG(gomock.Any()).Return(&expinfrav1.AutoScalingGroup{ Name: "name", @@ -533,7 +534,7 @@ func TestAWSMachinePoolReconciler(t *testing.T) { defer teardown(t, g) setSuspendedProcesses(t, g) ms.AWSMachinePool.Spec.SuspendProcesses.All = true - reconSvc.EXPECT().ReconcileLaunchTemplate(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).Return(nil) + reconSvc.EXPECT().ReconcileLaunchTemplate(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).Return(nil, nil) asgSvc.EXPECT().DescribeLifecycleHooks(gomock.Any()).Return(nil, nil) reconSvc.EXPECT().ReconcileTags(gomock.Any(), gomock.Any()).Return(nil) asgSvc.EXPECT().GetASGByName(gomock.Any()).Return(&expinfrav1.AutoScalingGroup{ @@ -574,7 +575,7 @@ func TestAWSMachinePoolReconciler(t *testing.T) { defer teardown(t, g) setSuspendedProcesses(t, g) - reconSvc.EXPECT().ReconcileLaunchTemplate(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).Return(nil) + reconSvc.EXPECT().ReconcileLaunchTemplate(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).Return(nil, nil) asgSvc.EXPECT().DescribeLifecycleHooks(gomock.Any()).Return(nil, nil) reconSvc.EXPECT().ReconcileTags(gomock.Any(), gomock.Any()).Return(nil) asgSvc.EXPECT().GetASGByName(gomock.Any()).Return(&expinfrav1.AutoScalingGroup{ @@ -600,7 +601,7 @@ func TestAWSMachinePoolReconciler(t *testing.T) { Name: "an-asg", DesiredCapacity: ptr.To[int32](1), } - reconSvc.EXPECT().ReconcileLaunchTemplate(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).Return(nil) + reconSvc.EXPECT().ReconcileLaunchTemplate(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).Return(nil, nil) asgSvc.EXPECT().DescribeLifecycleHooks(gomock.Any()).Return(nil, nil) asgSvc.EXPECT().GetASGByName(gomock.Any()).Return(&asg, nil) asgSvc.EXPECT().SubnetIDs(gomock.Any()).Return([]string{}, nil) @@ -641,7 +642,7 @@ func TestAWSMachinePoolReconciler(t *testing.T) { }, Subnets: []string{"subnet1", "subnet2"}, } - reconSvc.EXPECT().ReconcileLaunchTemplate(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).Return(nil) + reconSvc.EXPECT().ReconcileLaunchTemplate(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).Return(nil, nil) asgSvc.EXPECT().DescribeLifecycleHooks(gomock.Any()).Return(nil, nil) reconSvc.EXPECT().ReconcileTags(gomock.Any(), gomock.Any()).Return(nil) asgSvc.EXPECT().GetASGByName(gomock.Any()).Return(&asg, nil).AnyTimes() @@ -661,7 +662,7 @@ func TestAWSMachinePoolReconciler(t *testing.T) { MaxSize: int32(100), Subnets: []string{"subnet1", "subnet2"}, } - reconSvc.EXPECT().ReconcileLaunchTemplate(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).Return(nil) + reconSvc.EXPECT().ReconcileLaunchTemplate(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).Return(nil, nil) asgSvc.EXPECT().DescribeLifecycleHooks(gomock.Any()).Return(nil, nil) reconSvc.EXPECT().ReconcileTags(gomock.Any(), gomock.Any()).Return(nil) asgSvc.EXPECT().GetASGByName(gomock.Any()).Return(&asg, nil).AnyTimes() @@ -681,7 +682,7 @@ func TestAWSMachinePoolReconciler(t *testing.T) { MaxSize: int32(2), Subnets: []string{}, } - reconSvc.EXPECT().ReconcileLaunchTemplate(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).Return(nil) + reconSvc.EXPECT().ReconcileLaunchTemplate(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).Return(nil, nil) asgSvc.EXPECT().DescribeLifecycleHooks(gomock.Any()).Return(nil, nil) reconSvc.EXPECT().ReconcileTags(gomock.Any(), gomock.Any()).Return(nil) asgSvc.EXPECT().GetASGByName(gomock.Any()).Return(&asg, nil).AnyTimes() @@ -791,7 +792,7 @@ func TestAWSMachinePoolReconciler(t *testing.T) { nil) ec2Svc.EXPECT().DiscoverLaunchTemplateAMI(gomock.Any(), gomock.Any()).Return(ptr.To[string]("ami-different"), nil) ec2Svc.EXPECT().LaunchTemplateNeedsUpdate(gomock.Any(), gomock.Any(), gomock.Any()).Return(false, nil) - asgSvc.EXPECT().CanStartASGInstanceRefresh(gomock.Any()).Return(true, nil) + asgSvc.EXPECT().CanStartASGInstanceRefresh(gomock.Any()).Return(true, nil, nil) ec2Svc.EXPECT().PruneLaunchTemplateVersions(gomock.Any()).Return(nil, nil) ec2Svc.EXPECT().CreateLaunchTemplateVersion(gomock.Any(), gomock.Any(), gomock.Eq(ptr.To[string]("ami-different")), gomock.Eq(apimachinerytypes.NamespacedName{Namespace: "default", Name: "bootstrap-data"}), gomock.Any(), gomock.Any()).Return(nil) ec2Svc.EXPECT().GetLaunchTemplateLatestVersion(gomock.Any()).Return("2", nil) @@ -847,7 +848,7 @@ func TestAWSMachinePoolReconciler(t *testing.T) { nil) ec2Svc.EXPECT().DiscoverLaunchTemplateAMI(gomock.Any(), gomock.Any()).Return(ptr.To[string]("ami-existing"), nil) ec2Svc.EXPECT().LaunchTemplateNeedsUpdate(gomock.Any(), gomock.Any(), gomock.Any()).Return(false, nil) - asgSvc.EXPECT().CanStartASGInstanceRefresh(gomock.Any()).Return(true, nil) + asgSvc.EXPECT().CanStartASGInstanceRefresh(gomock.Any()).Return(true, nil, nil) ec2Svc.EXPECT().PruneLaunchTemplateVersions(gomock.Any()).Return(nil, nil) ec2Svc.EXPECT().CreateLaunchTemplateVersion(gomock.Any(), gomock.Any(), gomock.Eq(ptr.To[string]("ami-existing")), gomock.Eq(apimachinerytypes.NamespacedName{Namespace: "default", Name: "bootstrap-data"}), gomock.Any(), gomock.Any()).Return(nil) ec2Svc.EXPECT().GetLaunchTemplateLatestVersion(gomock.Any()).Return("2", nil) @@ -933,7 +934,7 @@ func TestAWSMachinePoolReconciler(t *testing.T) { nil) ec2Svc.EXPECT().DiscoverLaunchTemplateAMI(gomock.Any(), gomock.Any()).Return(ptr.To[string]("ami-existing"), nil) ec2Svc.EXPECT().LaunchTemplateNeedsUpdate(gomock.Any(), gomock.Any(), gomock.Any()).Return(false, nil) - asgSvc.EXPECT().CanStartASGInstanceRefresh(gomock.Any()).Return(true, nil) + asgSvc.EXPECT().CanStartASGInstanceRefresh(gomock.Any()).Return(true, nil, nil) ec2Svc.EXPECT().PruneLaunchTemplateVersions(gomock.Any()).Return(nil, nil) ec2Svc.EXPECT().CreateLaunchTemplateVersion(gomock.Any(), gomock.Any(), gomock.Eq(ptr.To[string]("ami-existing")), gomock.Eq(apimachinerytypes.NamespacedName{Namespace: "default", Name: "bootstrap-data-new"}), gomock.Any(), gomock.Any()).Return(nil) ec2Svc.EXPECT().GetLaunchTemplateLatestVersion(gomock.Any()).Return("2", nil) @@ -1025,15 +1026,19 @@ func TestAWSMachinePoolReconciler(t *testing.T) { return &s3.PutObjectOutput{}, nil }) - // Simulate a pending instance refresh - asgSvc.EXPECT().CanStartASGInstanceRefresh(gomock.Any()).Return(false, nil) + // Simulate a pending instance refresh here, to see if `CancelInstanceRefresh` gets called + instanceRefreshStatus := autoscalingtypes.InstanceRefreshStatusPending + asgSvc.EXPECT().CanStartASGInstanceRefresh(gomock.Any()).Return(false, &instanceRefreshStatus, nil) + asgSvc.EXPECT().CancelASGInstanceRefresh(gomock.Any()).Return(nil) - _, err := reconciler.reconcileNormal(context.Background(), ms, cs, cs, cs) - g.Expect(err).To(HaveOccurred()) - expectConditions(g, ms.AWSMachinePool, []conditionAssertion{{expinfrav1.PreLaunchTemplateUpdateCheckCondition, corev1.ConditionFalse, clusterv1.ConditionSeverityWarning, expinfrav1.PreLaunchTemplateUpdateCheckFailedReason}}) + // First reconciliation should notice the existing instance refresh and cancel it. + // Since the cancellation is asynchronous, the controller should requeue. + res, err := reconciler.reconcileNormal(context.Background(), ms, cs, cs, cs) + g.Expect(res.RequeueAfter).To(BeNumerically(">", 0)) + g.Expect(err).To(Succeed()) - // Now simulate that no pending instance refresh exists - asgSvc.EXPECT().CanStartASGInstanceRefresh(gomock.Any()).Return(true, nil) + // Now simulate that no pending instance refresh exists. Cancellation should not be called anymore. + asgSvc.EXPECT().CanStartASGInstanceRefresh(gomock.Any()).Return(true, nil, nil) asgSvc.EXPECT().GetASGByName(gomock.Any()).DoAndReturn(func(scope *scope.MachinePoolScope) (*expinfrav1.AutoScalingGroup, error) { g.Expect(scope.Name()).To(Equal("test")) @@ -1071,6 +1076,9 @@ func TestAWSMachinePoolReconciler(t *testing.T) { return &s3.PutObjectOutput{}, nil }) + // No cancellation expected in this second reconciliation (see above) + asgSvc.EXPECT().CancelASGInstanceRefresh(gomock.Any()).Times(0) + var simulatedDeletedVersionNumber int64 = 777 bootstrapDataHash := "some-simulated-hash" ec2Svc.EXPECT().PruneLaunchTemplateVersions(gomock.Any()).Return(&ec2.LaunchTemplateVersion{ @@ -1188,7 +1196,7 @@ func TestAWSMachinePoolReconciler(t *testing.T) { } ms.AWSMachinePool.Spec.AWSLifecycleHooks = append(ms.AWSMachinePool.Spec.AWSLifecycleHooks, newLifecycleHook) - reconSvc.EXPECT().ReconcileLaunchTemplate(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).Return(nil) + reconSvc.EXPECT().ReconcileLaunchTemplate(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).Return(nil, nil) // New ASG must be created with lifecycle hooks (single AWS SDK call is enough) // @@ -1214,7 +1222,7 @@ func TestAWSMachinePoolReconciler(t *testing.T) { } ms.AWSMachinePool.Spec.AWSLifecycleHooks = append(ms.AWSMachinePool.Spec.AWSLifecycleHooks, newLifecycleHook) - reconSvc.EXPECT().ReconcileLaunchTemplate(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).Return(nil) + reconSvc.EXPECT().ReconcileLaunchTemplate(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).Return(nil, nil) asgSvc.EXPECT().DescribeLifecycleHooks(gomock.Eq(ms.Name())).Return(nil, nil) asgSvc.EXPECT().CreateLifecycleHook(gomock.Any(), ms.Name(), &newLifecycleHook).Return(nil) reconSvc.EXPECT().ReconcileTags(gomock.Any(), gomock.Any()).Return(nil) @@ -1244,7 +1252,7 @@ func TestAWSMachinePoolReconciler(t *testing.T) { setup(t, g) defer teardown(t, g) - reconSvc.EXPECT().ReconcileLaunchTemplate(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).Return(nil) + reconSvc.EXPECT().ReconcileLaunchTemplate(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).Return(nil, nil) asgSvc.EXPECT().DescribeLifecycleHooks(gomock.Eq(ms.Name())).Return([]*expinfrav1.AWSLifecycleHook{ { Name: "hook-to-remove", @@ -1287,7 +1295,7 @@ func TestAWSMachinePoolReconciler(t *testing.T) { } ms.AWSMachinePool.Spec.AWSLifecycleHooks = append(ms.AWSMachinePool.Spec.AWSLifecycleHooks, newLifecycleHook) - reconSvc.EXPECT().ReconcileLaunchTemplate(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).Return(nil) + reconSvc.EXPECT().ReconcileLaunchTemplate(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).Return(nil, nil) asgSvc.EXPECT().DescribeLifecycleHooks(gomock.Eq(ms.Name())).Return([]*expinfrav1.AWSLifecycleHook{ { Name: "hook-to-remove", @@ -1331,7 +1339,7 @@ func TestAWSMachinePoolReconciler(t *testing.T) { } ms.AWSMachinePool.Spec.AWSLifecycleHooks = append(ms.AWSMachinePool.Spec.AWSLifecycleHooks, updateLifecycleHook) - reconSvc.EXPECT().ReconcileLaunchTemplate(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).Return(nil) + reconSvc.EXPECT().ReconcileLaunchTemplate(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).Return(nil, nil) asgSvc.EXPECT().DescribeLifecycleHooks(gomock.Eq(ms.Name())).Return([]*expinfrav1.AWSLifecycleHook{ { Name: "hook-to-update", diff --git a/exp/controllers/awsmanagedmachinepool_controller.go b/exp/controllers/awsmanagedmachinepool_controller.go index 543a74f445..639aa6d5b8 100644 --- a/exp/controllers/awsmanagedmachinepool_controller.go +++ b/exp/controllers/awsmanagedmachinepool_controller.go @@ -20,6 +20,7 @@ import ( "context" "time" + autoscalingtypes "github.com/aws/aws-sdk-go-v2/service/autoscaling/types" "github.com/pkg/errors" corev1 "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" @@ -192,7 +193,7 @@ func (r *AWSManagedMachinePoolReconciler) Reconcile(ctx context.Context, req ctr return ctrl.Result{}, r.reconcileDelete(ctx, machinePoolScope, managedControlPlaneScope) } - return ctrl.Result{}, r.reconcileNormal(ctx, machinePoolScope, managedControlPlaneScope, managedControlPlaneScope) + return r.reconcileNormal(ctx, machinePoolScope, managedControlPlaneScope, managedControlPlaneScope) } func (r *AWSManagedMachinePoolReconciler) reconcileNormal( @@ -200,12 +201,12 @@ func (r *AWSManagedMachinePoolReconciler) reconcileNormal( machinePoolScope *scope.ManagedMachinePoolScope, ec2Scope scope.EC2Scope, s3Scope scope.S3Scope, -) error { +) (ctrl.Result, error) { machinePoolScope.Info("Reconciling AWSManagedMachinePool") if controllerutil.AddFinalizer(machinePoolScope.ManagedMachinePool, expinfrav1.ManagedMachinePoolFinalizer) { if err := machinePoolScope.PatchObject(); err != nil { - return err + return ctrl.Result{}, err } } @@ -214,18 +215,25 @@ func (r *AWSManagedMachinePoolReconciler) reconcileNormal( reconSvc := r.getReconcileService(ec2Scope) if machinePoolScope.ManagedMachinePool.Spec.AWSLaunchTemplate != nil { - canUpdateLaunchTemplate := func() (bool, error) { - return true, nil + canStartInstanceRefresh := func() (bool, *autoscalingtypes.InstanceRefreshStatus, error) { + return true, nil, nil + } + cancelInstanceRefresh := func() error { + return nil } runPostLaunchTemplateUpdateOperation := func() error { return nil } var objectStoreSvc services.ObjectStoreInterface // nil because no S3 bucket support for `AWSManagedControlPlane` yet - if err := reconSvc.ReconcileLaunchTemplate(ctx, machinePoolScope, machinePoolScope, s3Scope, ec2svc, objectStoreSvc, canUpdateLaunchTemplate, runPostLaunchTemplateUpdateOperation); err != nil { + res, err := reconSvc.ReconcileLaunchTemplate(ctx, machinePoolScope, machinePoolScope, s3Scope, ec2svc, objectStoreSvc, canStartInstanceRefresh, cancelInstanceRefresh, runPostLaunchTemplateUpdateOperation) + if err != nil { r.Recorder.Eventf(machinePoolScope.ManagedMachinePool, corev1.EventTypeWarning, "FailedLaunchTemplateReconcile", "Failed to reconcile launch template: %v", err) machinePoolScope.Error(err, "failed to reconcile launch template") conditions.MarkFalse(machinePoolScope.ManagedMachinePool, expinfrav1.LaunchTemplateReadyCondition, expinfrav1.LaunchTemplateReconcileFailedReason, clusterv1.ConditionSeverityError, "") - return err + return ctrl.Result{}, err + } + if res != nil { + return *res, nil } launchTemplateID := machinePoolScope.GetLaunchTemplateIDStatus() @@ -234,7 +242,7 @@ func (r *AWSManagedMachinePoolReconciler) reconcileNormal( ResourceService: ec2svc, }} if err := reconSvc.ReconcileTags(machinePoolScope, resourceServiceToUpdate); err != nil { - return errors.Wrap(err, "error updating tags") + return ctrl.Result{}, errors.Wrap(err, "error updating tags") } // set the LaunchTemplateReady condition @@ -242,10 +250,10 @@ func (r *AWSManagedMachinePoolReconciler) reconcileNormal( } if err := ekssvc.ReconcilePool(ctx); err != nil { - return errors.Wrapf(err, "failed to reconcile machine pool for AWSManagedMachinePool %s/%s", machinePoolScope.ManagedMachinePool.Namespace, machinePoolScope.ManagedMachinePool.Name) + return ctrl.Result{}, errors.Wrapf(err, "failed to reconcile machine pool for AWSManagedMachinePool %s/%s", machinePoolScope.ManagedMachinePool.Namespace, machinePoolScope.ManagedMachinePool.Name) } - return nil + return ctrl.Result{}, nil } func (r *AWSManagedMachinePoolReconciler) reconcileDelete( diff --git a/pkg/cloud/services/autoscaling/autoscalinggroup.go b/pkg/cloud/services/autoscaling/autoscalinggroup.go index 6a288615d9..a4f1f13b25 100644 --- a/pkg/cloud/services/autoscaling/autoscalinggroup.go +++ b/pkg/cloud/services/autoscaling/autoscalinggroup.go @@ -305,30 +305,50 @@ func (s *Service) UpdateASG(machinePoolScope *scope.MachinePoolScope) error { return nil } -// CanStartASGInstanceRefresh will start an ASG instance with refresh. -func (s *Service) CanStartASGInstanceRefresh(scope *scope.MachinePoolScope) (bool, error) { +// CanStartASGInstanceRefresh checks if a new ASG instance refresh can currently be started, and returns the status if there is an existing, unfinished refresh. +func (s *Service) CanStartASGInstanceRefresh(scope *scope.MachinePoolScope) (bool, *autoscalingtypes.InstanceRefreshStatus, error) { describeInput := &autoscaling.DescribeInstanceRefreshesInput{AutoScalingGroupName: aws.String(scope.Name())} refreshes, err := s.ASGClient.DescribeInstanceRefreshes(context.TODO(), describeInput) if err != nil { - return false, err - } - hasUnfinishedRefresh := false - if len(refreshes.InstanceRefreshes) != 0 { - for i := range refreshes.InstanceRefreshes { - if refreshes.InstanceRefreshes[i].Status == autoscalingtypes.InstanceRefreshStatusInProgress || - refreshes.InstanceRefreshes[i].Status == autoscalingtypes.InstanceRefreshStatusPending || - refreshes.InstanceRefreshes[i].Status == autoscalingtypes.InstanceRefreshStatusCancelling { - hasUnfinishedRefresh = true - } + return false, nil, err + } + var unfinishedRefreshStatus *autoscalingtypes.InstanceRefreshStatus + for _, refresh := range refreshes.InstanceRefreshes { + if refresh.Status == autoscalingtypes.InstanceRefreshStatusInProgress || + refresh.Status == autoscalingtypes.InstanceRefreshStatusPending || + refresh.Status == autoscalingtypes.InstanceRefreshStatusCancelling { + unfinishedRefreshStatus = &refresh.Status } } - if hasUnfinishedRefresh { - return false, nil + if unfinishedRefreshStatus != nil { + // There's an unfinished instance refresh, so no other refresh can be started right now + return false, unfinishedRefreshStatus, nil + } + return true, nil, nil +} + +// CancelASGInstanceRefresh cancels an ASG instance refresh. +func (s *Service) CancelASGInstanceRefresh(scope *scope.MachinePoolScope) error { + input := &autoscaling.CancelInstanceRefreshInput{ + AutoScalingGroupName: aws.String(scope.Name()), } - return true, nil + + if _, err := s.ASGClient.CancelInstanceRefresh(context.TODO(), input); err != nil { + smithyErr := awserrors.ParseSmithyError(err) + if smithyErr.ErrorCode() == (&autoscalingtypes.ActiveInstanceRefreshNotFoundFault{}).ErrorCode() { + // Refresh isn't "in progress". It may have turned to cancelled status + // by now. So this is not an error for us because we may have called + // CancelInstanceRefresh multiple times and should be idempotent here. + return nil + } + + return errors.Wrapf(err, "failed to cancel ASG instance refresh %q", scope.Name()) + } + + return nil } -// StartASGInstanceRefresh will start an ASG instance with refresh. +// StartASGInstanceRefresh will start an ASG instance refresh. func (s *Service) StartASGInstanceRefresh(scope *scope.MachinePoolScope) error { strategy := ptr.To(autoscalingtypes.RefreshStrategyRolling) var minHealthyPercentage, maxHealthyPercentage, instanceWarmup *int32 diff --git a/pkg/cloud/services/autoscaling/autoscalinggroup_test.go b/pkg/cloud/services/autoscaling/autoscalinggroup_test.go index 91660a1394..a6fd5a2178 100644 --- a/pkg/cloud/services/autoscaling/autoscalinggroup_test.go +++ b/pkg/cloud/services/autoscaling/autoscalinggroup_test.go @@ -1125,10 +1125,11 @@ func TestServiceCanStartASGInstanceRefresh(t *testing.T) { defer mockCtrl.Finish() tests := []struct { - name string - wantErr bool - canStart bool - expect func(m *mock_autoscalingiface.MockAutoScalingAPIMockRecorder) + name string + wantErr bool + wantUnfinishedRefreshStatus *string + canStart bool + expect func(m *mock_autoscalingiface.MockAutoScalingAPIMockRecorder) }{ { name: "should return error if describe instance refresh failed", @@ -1153,9 +1154,10 @@ func TestServiceCanStartASGInstanceRefresh(t *testing.T) { }, }, { - name: "should return false if some instances have unfinished refresh", - wantErr: false, - canStart: false, + name: "should return false if some instances have unfinished refresh", + wantErr: false, + wantUnfinishedRefreshStatus: aws.String(string(autoscalingtypes.InstanceRefreshStatusInProgress)), + canStart: false, expect: func(m *mock_autoscalingiface.MockAutoScalingAPIMockRecorder) { m.DescribeInstanceRefreshes(context.TODO(), gomock.Eq(&autoscaling.DescribeInstanceRefreshesInput{ AutoScalingGroupName: aws.String("machinePoolName"), @@ -1187,13 +1189,14 @@ func TestServiceCanStartASGInstanceRefresh(t *testing.T) { g.Expect(err).ToNot(HaveOccurred()) mps.AWSMachinePool.Name = "machinePoolName" - out, err := s.CanStartASGInstanceRefresh(mps) + out, unfinishedRefreshStatus, err := s.CanStartASGInstanceRefresh(mps) checkErr(tt.wantErr, err, g) if tt.canStart { g.Expect(out).To(BeTrue()) - return + } else { + g.Expect(out).To(BeFalse()) + g.Expect(unfinishedRefreshStatus).To(BeEquivalentTo(tt.wantUnfinishedRefreshStatus)) } - g.Expect(out).To(BeFalse()) }) } } diff --git a/pkg/cloud/services/autoscaling/mock_autoscalingiface/autoscaling_mock.go b/pkg/cloud/services/autoscaling/mock_autoscalingiface/autoscaling_mock.go index 2f03a558db..b589873c98 100644 --- a/pkg/cloud/services/autoscaling/mock_autoscalingiface/autoscaling_mock.go +++ b/pkg/cloud/services/autoscaling/mock_autoscalingiface/autoscaling_mock.go @@ -51,6 +51,26 @@ func (m *MockAutoScalingAPI) EXPECT() *MockAutoScalingAPIMockRecorder { return m.recorder } +// CancelInstanceRefresh mocks base method. +func (m *MockAutoScalingAPI) CancelInstanceRefresh(arg0 context.Context, arg1 *autoscaling.CancelInstanceRefreshInput, arg2 ...func(*autoscaling.Options)) (*autoscaling.CancelInstanceRefreshOutput, error) { + m.ctrl.T.Helper() + varargs := []interface{}{arg0, arg1} + for _, a := range arg2 { + varargs = append(varargs, a) + } + ret := m.ctrl.Call(m, "CancelInstanceRefresh", varargs...) + ret0, _ := ret[0].(*autoscaling.CancelInstanceRefreshOutput) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// CancelInstanceRefresh indicates an expected call of CancelInstanceRefresh. +func (mr *MockAutoScalingAPIMockRecorder) CancelInstanceRefresh(arg0, arg1 interface{}, arg2 ...interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + varargs := append([]interface{}{arg0, arg1}, arg2...) + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "CancelInstanceRefresh", reflect.TypeOf((*MockAutoScalingAPI)(nil).CancelInstanceRefresh), varargs...) +} + // CreateAutoScalingGroup mocks base method. func (m *MockAutoScalingAPI) CreateAutoScalingGroup(arg0 context.Context, arg1 *autoscaling.CreateAutoScalingGroupInput, arg2 ...func(*autoscaling.Options)) (*autoscaling.CreateAutoScalingGroupOutput, error) { m.ctrl.T.Helper() diff --git a/pkg/cloud/services/autoscaling/service.go b/pkg/cloud/services/autoscaling/service.go index ec9019bbf9..714da1899d 100644 --- a/pkg/cloud/services/autoscaling/service.go +++ b/pkg/cloud/services/autoscaling/service.go @@ -37,6 +37,7 @@ type Service struct { // AutoScalingAPI is an interface for the AWS AutoScaling API client. type AutoScalingAPI interface { + CancelInstanceRefresh(ctx context.Context, params *autoscaling.CancelInstanceRefreshInput, optFns ...func(*autoscaling.Options)) (*autoscaling.CancelInstanceRefreshOutput, error) CreateAutoScalingGroup(ctx context.Context, params *autoscaling.CreateAutoScalingGroupInput, optFns ...func(*autoscaling.Options)) (*autoscaling.CreateAutoScalingGroupOutput, error) DeleteAutoScalingGroup(ctx context.Context, params *autoscaling.DeleteAutoScalingGroupInput, optFns ...func(*autoscaling.Options)) (*autoscaling.DeleteAutoScalingGroupOutput, error) DescribeAutoScalingGroups(ctx context.Context, params *autoscaling.DescribeAutoScalingGroupsInput, optFns ...func(*autoscaling.Options)) (*autoscaling.DescribeAutoScalingGroupsOutput, error) diff --git a/pkg/cloud/services/ec2/launchtemplate.go b/pkg/cloud/services/ec2/launchtemplate.go index 163247c15a..dcf83ec34d 100644 --- a/pkg/cloud/services/ec2/launchtemplate.go +++ b/pkg/cloud/services/ec2/launchtemplate.go @@ -23,7 +23,9 @@ import ( "sort" "strconv" "strings" + "time" + autoscalingtypes "github.com/aws/aws-sdk-go-v2/service/autoscaling/types" "github.com/aws/aws-sdk-go/aws" "github.com/aws/aws-sdk-go/service/ec2" "github.com/blang/semver" @@ -34,6 +36,7 @@ import ( corev1 "k8s.io/api/core/v1" apimachinerytypes "k8s.io/apimachinery/pkg/types" "k8s.io/utils/ptr" + ctrl "sigs.k8s.io/controller-runtime" infrav1 "sigs.k8s.io/cluster-api-provider-aws/v2/api/v1beta2" expinfrav1 "sigs.k8s.io/cluster-api-provider-aws/v2/exp/api/v1beta2" @@ -68,25 +71,26 @@ func (s *Service) ReconcileLaunchTemplate( s3Scope scope.S3Scope, ec2svc services.EC2Interface, objectStoreSvc services.ObjectStoreInterface, - canUpdateLaunchTemplate func() (bool, error), + canStartInstanceRefresh func() (bool, *autoscalingtypes.InstanceRefreshStatus, error), + cancelInstanceRefresh func() error, runPostLaunchTemplateUpdateOperation func() error, -) error { +) (*ctrl.Result, error) { bootstrapData, bootstrapDataFormat, bootstrapDataSecretKey, err := scope.GetRawBootstrapData() if err != nil { record.Eventf(scope.GetMachinePool(), corev1.EventTypeWarning, "FailedGetBootstrapData", err.Error()) - return err + return nil, err } scope.Info("checking for existing launch template") launchTemplate, launchTemplateUserDataHash, launchTemplateUserDataSecretKey, _, err := ec2svc.GetLaunchTemplate(scope.LaunchTemplateName()) if err != nil { conditions.MarkUnknown(scope.GetSetter(), expinfrav1.LaunchTemplateReadyCondition, expinfrav1.LaunchTemplateNotFoundReason, "%s", err.Error()) - return err + return nil, err } imageID, err := ec2svc.DiscoverLaunchTemplateAMI(ctx, scope) if err != nil { conditions.MarkFalse(scope.GetSetter(), expinfrav1.LaunchTemplateReadyCondition, expinfrav1.LaunchTemplateCreateFailedReason, clusterv1.ConditionSeverityError, "%s", err.Error()) - return err + return nil, err } var ignitionStorageType = infrav1.DefaultMachinePoolIgnitionStorageType @@ -99,7 +103,7 @@ func (s *Service) ReconcileLaunchTemplate( var userDataForLaunchTemplate []byte if bootstrapDataFormat == "ignition" && ignitionStorageType == infrav1.IgnitionStorageTypeOptionClusterObjectStore { if s3Scope.Bucket() == nil { - return errors.New("using Ignition with `AWSMachinePool.spec.ignition.storageType=ClusterObjectStore` " + + return nil, errors.New("using Ignition with `AWSMachinePool.spec.ignition.storageType=ClusterObjectStore` " + "requires a cluster wide object storage configured at `AWSCluster.spec.s3Bucket`") } @@ -114,14 +118,14 @@ func (s *Service) ReconcileLaunchTemplate( if err != nil { conditions.MarkFalse(scope.GetSetter(), expinfrav1.LaunchTemplateReadyCondition, expinfrav1.LaunchTemplateReconcileFailedReason, clusterv1.ConditionSeverityError, "%s", err.Error()) - return err + return nil, err } semver, err := semver.ParseTolerant(ignitionVersion) if err != nil { err = errors.Wrapf(err, "failed to parse ignition version %q", ignitionVersion) conditions.MarkFalse(scope.GetSetter(), expinfrav1.LaunchTemplateReadyCondition, expinfrav1.LaunchTemplateReconcileFailedReason, clusterv1.ConditionSeverityError, "%s", err.Error()) - return err + return nil, err } // EC2 user data points to S3 @@ -144,7 +148,7 @@ func (s *Service) ReconcileLaunchTemplate( if err != nil { err = errors.Wrap(err, "failed to convert ignition config to JSON") conditions.MarkFalse(scope.GetSetter(), expinfrav1.LaunchTemplateReadyCondition, expinfrav1.LaunchTemplateReconcileFailedReason, clusterv1.ConditionSeverityError, "%s", err.Error()) - return err + return nil, err } case 3: ignData := &ignV3Types.Config{ @@ -164,12 +168,12 @@ func (s *Service) ReconcileLaunchTemplate( if err != nil { err = errors.Wrap(err, "failed to convert ignition config to JSON") conditions.MarkFalse(scope.GetSetter(), expinfrav1.LaunchTemplateReadyCondition, expinfrav1.LaunchTemplateReconcileFailedReason, clusterv1.ConditionSeverityError, "%s", err.Error()) - return err + return nil, err } default: err = errors.Errorf("unsupported ignition version %q", ignitionVersion) conditions.MarkFalse(scope.GetSetter(), expinfrav1.LaunchTemplateReadyCondition, expinfrav1.LaunchTemplateReconcileFailedReason, clusterv1.ConditionSeverityError, "%s", err.Error()) - return err + return nil, err } } else { // S3 bucket not used, so the bootstrap data is stored directly in the launch template @@ -184,11 +188,11 @@ func (s *Service) ReconcileLaunchTemplate( launchTemplateID, err := ec2svc.CreateLaunchTemplate(scope, imageID, *bootstrapDataSecretKey, userDataForLaunchTemplate, userdata.ComputeHash(bootstrapData)) if err != nil { conditions.MarkFalse(scope.GetSetter(), expinfrav1.LaunchTemplateReadyCondition, expinfrav1.LaunchTemplateCreateFailedReason, clusterv1.ConditionSeverityError, "%s", err.Error()) - return err + return nil, err } scope.SetLaunchTemplateIDStatus(launchTemplateID) - return scope.PatchObject() + return nil, scope.PatchObject() } // LaunchTemplateID is set during LaunchTemplate creation, but for a scenario such as `clusterctl move`, status fields become blank. @@ -197,27 +201,29 @@ func (s *Service) ReconcileLaunchTemplate( launchTemplateID, err := ec2svc.GetLaunchTemplateID(scope.LaunchTemplateName()) if err != nil { conditions.MarkUnknown(scope.GetSetter(), expinfrav1.LaunchTemplateReadyCondition, expinfrav1.LaunchTemplateNotFoundReason, "%s", err.Error()) - return err + return nil, err } scope.SetLaunchTemplateIDStatus(launchTemplateID) - return scope.PatchObject() + if err = scope.PatchObject(); err != nil { + return nil, err + } } if scope.GetLaunchTemplateLatestVersionStatus() == "" { launchTemplateVersion, err := ec2svc.GetLaunchTemplateLatestVersion(scope.GetLaunchTemplateIDStatus()) if err != nil { conditions.MarkUnknown(scope.GetSetter(), expinfrav1.LaunchTemplateReadyCondition, expinfrav1.LaunchTemplateNotFoundReason, "%s", err.Error()) - return err + return nil, err } scope.SetLaunchTemplateLatestVersionStatus(launchTemplateVersion) - if err := scope.PatchObject(); err != nil { - return err + if err = scope.PatchObject(); err != nil { + return nil, err } } annotation, err := MachinePoolAnnotationJSON(scope, TagsLastAppliedAnnotation) if err != nil { - return err + return nil, err } // Check if the instance tags were changed. If they were, create a new LaunchTemplate. @@ -225,7 +231,7 @@ func (s *Service) ReconcileLaunchTemplate( needsUpdate, err := ec2svc.LaunchTemplateNeedsUpdate(scope, scope.GetLaunchTemplate(), launchTemplate) if err != nil { - return err + return nil, err } amiChanged := *imageID != *launchTemplate.AMI.ID @@ -239,13 +245,29 @@ func (s *Service) ReconcileLaunchTemplate( launchTemplateNeedsUserDataSecretKeyTag := launchTemplateUserDataSecretKey == nil if needsUpdate || tagsChanged || amiChanged || userDataSecretKeyChanged { - canUpdate, err := canUpdateLaunchTemplate() + // More than just the bootstrap token changed + + canStartRefresh, unfinishedRefreshStatus, err := canStartInstanceRefresh() if err != nil { - return err + return nil, err } - if !canUpdate { - conditions.MarkFalse(scope.GetSetter(), expinfrav1.PreLaunchTemplateUpdateCheckCondition, expinfrav1.PreLaunchTemplateUpdateCheckFailedReason, clusterv1.ConditionSeverityWarning, "") - return errors.New("Cannot update the launch template, prerequisite not met") + if !canStartRefresh { + if unfinishedRefreshStatus != nil && *unfinishedRefreshStatus != autoscalingtypes.InstanceRefreshStatusCancelling { + // Until the previous instance refresh goes into `Cancelled` state + // asynchronously, allowing another refresh to be started, + // defer the reconciliation. Otherwise, we get an + // `ErrCodeInstanceRefreshInProgressFault` error if we tried to + // start an instance refresh immediately. + scope.Info("Cancelling previous instance refresh and delaying reconciliation until the next one can be started", "unfinishedRefreshStatus", unfinishedRefreshStatus) + + err := cancelInstanceRefresh() + if err != nil { + return nil, err + } + } else { + scope.Info("Existing instance refresh is not finished, delaying reconciliation until the next one can be started", "unfinishedRefreshStatus", unfinishedRefreshStatus) + } + return &ctrl.Result{RequeueAfter: 30 * time.Second}, nil } } @@ -260,7 +282,7 @@ func (s *Service) ReconcileLaunchTemplate( // We ensure that the number of versions does not grow without bound by following a simple rule: Before we create a new version, we delete one old version, if there is at least one old version that is not in use. deletedLaunchTemplateVersion, err := ec2svc.PruneLaunchTemplateVersions(scope.GetLaunchTemplateIDStatus()) if err != nil { - return err + return nil, err } // S3 objects should be deleted as soon as possible if they're not used @@ -269,7 +291,7 @@ func (s *Service) ReconcileLaunchTemplate( if feature.Gates.Enabled(feature.MachinePool) && deletedLaunchTemplateVersion != nil { _, _, _, deletedLaunchTemplateVersionBootstrapDataHash, err := s.SDKToLaunchTemplate(deletedLaunchTemplateVersion) if err != nil { - return err + return nil, err } if deletedLaunchTemplateVersionBootstrapDataHash != nil && s3Scope.Bucket() != nil && bootstrapDataFormat == "ignition" && ignitionStorageType == infrav1.IgnitionStorageTypeOptionClusterObjectStore { @@ -285,28 +307,28 @@ func (s *Service) ReconcileLaunchTemplate( } if err := ec2svc.CreateLaunchTemplateVersion(scope.GetLaunchTemplateIDStatus(), scope, imageID, *bootstrapDataSecretKey, userDataForLaunchTemplate, userdata.ComputeHash(bootstrapData)); err != nil { - return err + return nil, err } version, err := ec2svc.GetLaunchTemplateLatestVersion(scope.GetLaunchTemplateIDStatus()) if err != nil { - return err + return nil, err } scope.SetLaunchTemplateLatestVersionStatus(version) if err := scope.PatchObject(); err != nil { - return err + return nil, err } } if needsUpdate || tagsChanged || amiChanged || userDataSecretKeyChanged { if err := runPostLaunchTemplateUpdateOperation(); err != nil { conditions.MarkFalse(scope.GetSetter(), expinfrav1.PostLaunchTemplateUpdateOperationCondition, expinfrav1.PostLaunchTemplateUpdateOperationFailedReason, clusterv1.ConditionSeverityError, "%s", err.Error()) - return err + return nil, err } conditions.MarkTrue(scope.GetSetter(), expinfrav1.PostLaunchTemplateUpdateOperationCondition) } - return nil + return nil, nil } // ReconcileTags reconciles the tags for the AWSMachinePool instances. diff --git a/pkg/cloud/services/interfaces.go b/pkg/cloud/services/interfaces.go index 2641f0eec2..11a0bb9823 100644 --- a/pkg/cloud/services/interfaces.go +++ b/pkg/cloud/services/interfaces.go @@ -20,8 +20,10 @@ package services import ( "context" + autoscalingtypes "github.com/aws/aws-sdk-go-v2/service/autoscaling/types" "github.com/aws/aws-sdk-go/service/ec2" apimachinerytypes "k8s.io/apimachinery/pkg/types" + ctrl "sigs.k8s.io/controller-runtime" infrav1 "sigs.k8s.io/cluster-api-provider-aws/v2/api/v1beta2" expinfrav1 "sigs.k8s.io/cluster-api-provider-aws/v2/exp/api/v1beta2" @@ -44,8 +46,9 @@ type ASGInterface interface { GetASGByName(scope *scope.MachinePoolScope) (*expinfrav1.AutoScalingGroup, error) CreateASG(scope *scope.MachinePoolScope) (*expinfrav1.AutoScalingGroup, error) UpdateASG(scope *scope.MachinePoolScope) error + CancelASGInstanceRefresh(scope *scope.MachinePoolScope) error StartASGInstanceRefresh(scope *scope.MachinePoolScope) error - CanStartASGInstanceRefresh(scope *scope.MachinePoolScope) (bool, error) + CanStartASGInstanceRefresh(scope *scope.MachinePoolScope) (bool, *autoscalingtypes.InstanceRefreshStatus, error) UpdateResourceTags(resourceID *string, create, remove map[string]string) error DeleteASGAndWait(id string) error SuspendProcesses(name string, processes []string) error @@ -97,7 +100,7 @@ type EC2Interface interface { // separate from EC2Interface so that we can mock AWS requests separately. For example, by not mocking the // ReconcileLaunchTemplate function, but mocking EC2Interface, we can test which EC2 API operations would have been called. type MachinePoolReconcileInterface interface { - ReconcileLaunchTemplate(ctx context.Context, ignitionScope scope.IgnitionScope, scope scope.LaunchTemplateScope, s3Scope scope.S3Scope, ec2svc EC2Interface, objectStoreSvc ObjectStoreInterface, canUpdateLaunchTemplate func() (bool, error), runPostLaunchTemplateUpdateOperation func() error) error + ReconcileLaunchTemplate(ctx context.Context, ignitionScope scope.IgnitionScope, scope scope.LaunchTemplateScope, s3Scope scope.S3Scope, ec2svc EC2Interface, objectStoreSvc ObjectStoreInterface, canUpdateLaunchTemplate func() (bool, *autoscalingtypes.InstanceRefreshStatus, error), cancelInstanceRefresh func() error, runPostLaunchTemplateUpdateOperation func() error) (*ctrl.Result, error) ReconcileTags(scope scope.LaunchTemplateScope, resourceServicesToUpdate []scope.ResourceServiceToUpdate) error } diff --git a/pkg/cloud/services/mock_services/autoscaling_interface_mock.go b/pkg/cloud/services/mock_services/autoscaling_interface_mock.go index fcee553a07..7381f6b913 100644 --- a/pkg/cloud/services/mock_services/autoscaling_interface_mock.go +++ b/pkg/cloud/services/mock_services/autoscaling_interface_mock.go @@ -24,6 +24,7 @@ import ( context "context" reflect "reflect" + types "github.com/aws/aws-sdk-go-v2/service/autoscaling/types" gomock "github.com/golang/mock/gomock" v1beta2 "sigs.k8s.io/cluster-api-provider-aws/v2/exp/api/v1beta2" scope "sigs.k8s.io/cluster-api-provider-aws/v2/pkg/cloud/scope" @@ -68,12 +69,13 @@ func (mr *MockASGInterfaceMockRecorder) ASGIfExists(arg0 interface{}) *gomock.Ca } // CanStartASGInstanceRefresh mocks base method. -func (m *MockASGInterface) CanStartASGInstanceRefresh(arg0 *scope.MachinePoolScope) (bool, error) { +func (m *MockASGInterface) CanStartASGInstanceRefresh(arg0 *scope.MachinePoolScope) (bool, *types.InstanceRefreshStatus, error) { m.ctrl.T.Helper() ret := m.ctrl.Call(m, "CanStartASGInstanceRefresh", arg0) ret0, _ := ret[0].(bool) - ret1, _ := ret[1].(error) - return ret0, ret1 + ret1, _ := ret[1].(*types.InstanceRefreshStatus) + ret2, _ := ret[2].(error) + return ret0, ret1, ret2 } // CanStartASGInstanceRefresh indicates an expected call of CanStartASGInstanceRefresh. @@ -82,6 +84,20 @@ func (mr *MockASGInterfaceMockRecorder) CanStartASGInstanceRefresh(arg0 interfac return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "CanStartASGInstanceRefresh", reflect.TypeOf((*MockASGInterface)(nil).CanStartASGInstanceRefresh), arg0) } +// CancelASGInstanceRefresh mocks base method. +func (m *MockASGInterface) CancelASGInstanceRefresh(arg0 *scope.MachinePoolScope) error { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "CancelASGInstanceRefresh", arg0) + ret0, _ := ret[0].(error) + return ret0 +} + +// CancelASGInstanceRefresh indicates an expected call of CancelASGInstanceRefresh. +func (mr *MockASGInterfaceMockRecorder) CancelASGInstanceRefresh(arg0 interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "CancelASGInstanceRefresh", reflect.TypeOf((*MockASGInterface)(nil).CancelASGInstanceRefresh), arg0) +} + // CreateASG mocks base method. func (m *MockASGInterface) CreateASG(arg0 *scope.MachinePoolScope) (*v1beta2.AutoScalingGroup, error) { m.ctrl.T.Helper() diff --git a/pkg/cloud/services/mock_services/reconcile_interface_mock.go b/pkg/cloud/services/mock_services/reconcile_interface_mock.go index f3fd1f490d..d6d393eca8 100644 --- a/pkg/cloud/services/mock_services/reconcile_interface_mock.go +++ b/pkg/cloud/services/mock_services/reconcile_interface_mock.go @@ -24,9 +24,11 @@ import ( context "context" reflect "reflect" + types "github.com/aws/aws-sdk-go-v2/service/autoscaling/types" gomock "github.com/golang/mock/gomock" scope "sigs.k8s.io/cluster-api-provider-aws/v2/pkg/cloud/scope" services "sigs.k8s.io/cluster-api-provider-aws/v2/pkg/cloud/services" + reconcile "sigs.k8s.io/controller-runtime/pkg/reconcile" ) // MockMachinePoolReconcileInterface is a mock of MachinePoolReconcileInterface interface. @@ -53,17 +55,18 @@ func (m *MockMachinePoolReconcileInterface) EXPECT() *MockMachinePoolReconcileIn } // ReconcileLaunchTemplate mocks base method. -func (m *MockMachinePoolReconcileInterface) ReconcileLaunchTemplate(arg0 context.Context, arg1 scope.IgnitionScope, arg2 scope.LaunchTemplateScope, arg3 scope.S3Scope, arg4 services.EC2Interface, arg5 services.ObjectStoreInterface, arg6 func() (bool, error), arg7 func() error) error { +func (m *MockMachinePoolReconcileInterface) ReconcileLaunchTemplate(arg0 context.Context, arg1 scope.IgnitionScope, arg2 scope.LaunchTemplateScope, arg3 scope.S3Scope, arg4 services.EC2Interface, arg5 services.ObjectStoreInterface, arg6 func() (bool, *types.InstanceRefreshStatus, error), arg7, arg8 func() error) (*reconcile.Result, error) { m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "ReconcileLaunchTemplate", arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7) - ret0, _ := ret[0].(error) - return ret0 + ret := m.ctrl.Call(m, "ReconcileLaunchTemplate", arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8) + ret0, _ := ret[0].(*reconcile.Result) + ret1, _ := ret[1].(error) + return ret0, ret1 } // ReconcileLaunchTemplate indicates an expected call of ReconcileLaunchTemplate. -func (mr *MockMachinePoolReconcileInterfaceMockRecorder) ReconcileLaunchTemplate(arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7 interface{}) *gomock.Call { +func (mr *MockMachinePoolReconcileInterfaceMockRecorder) ReconcileLaunchTemplate(arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8 interface{}) *gomock.Call { mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "ReconcileLaunchTemplate", reflect.TypeOf((*MockMachinePoolReconcileInterface)(nil).ReconcileLaunchTemplate), arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7) + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "ReconcileLaunchTemplate", reflect.TypeOf((*MockMachinePoolReconcileInterface)(nil).ReconcileLaunchTemplate), arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8) } // ReconcileTags mocks base method.