diff --git a/config/crd/bases/infrastructure.cluster.x-k8s.io_awsmachinepools.yaml b/config/crd/bases/infrastructure.cluster.x-k8s.io_awsmachinepools.yaml index 1144678d9a..949d48da9f 100644 --- a/config/crd/bases/infrastructure.cluster.x-k8s.io_awsmachinepools.yaml +++ b/config/crd/bases/infrastructure.cluster.x-k8s.io_awsmachinepools.yaml @@ -1161,6 +1161,10 @@ spec: can be added as events to the Machine object and/or logged in the controller's output. type: string + infrastructureMachineKind: + description: InfrastructureMachineKind is the kind of the infrastructure + resources behind MachinePool Machines. + type: string instances: description: Instances contains the status for each instance in the pool diff --git a/config/rbac/role.yaml b/config/rbac/role.yaml index 9b51abf039..cb04a602d7 100644 --- a/config/rbac/role.yaml +++ b/config/rbac/role.yaml @@ -72,7 +72,6 @@ rules: - clusters - clusters/status - machinedeployments - - machines - machines/status verbs: - get @@ -88,6 +87,15 @@ rules: - list - patch - watch +- apiGroups: + - cluster.x-k8s.io + resources: + - machines + verbs: + - delete + - get + - list + - watch - apiGroups: - controlplane.cluster.x-k8s.io resources: @@ -150,7 +158,6 @@ rules: - awsclusters - awsfargateprofiles - awsmachinepools - - awsmachines - awsmanagedclusters - awsmanagedmachinepools - rosaclusters @@ -186,6 +193,18 @@ rules: - patch - update - watch +- apiGroups: + - infrastructure.cluster.x-k8s.io + resources: + - awsmachines + verbs: + - create + - delete + - get + - list + - patch + - update + - watch - apiGroups: - infrastructure.cluster.x-k8s.io resources: diff --git a/controllers/awsmachine_controller.go b/controllers/awsmachine_controller.go index 3380fbc856..96c63ccc89 100644 --- a/controllers/awsmachine_controller.go +++ b/controllers/awsmachine_controller.go @@ -142,10 +142,11 @@ func (r *AWSMachineReconciler) getObjectStoreService(scope scope.S3Scope) servic return s3.NewService(scope) } -// +kubebuilder:rbac:groups=infrastructure.cluster.x-k8s.io,resources=awsmachines,verbs=get;list;watch;update;patch;delete -// +kubebuilder:rbac:groups=infrastructure.cluster.x-k8s.io,resources=awsmachines/status,verbs=get;update;patch // +kubebuilder:rbac:groups=controlplane.cluster.x-k8s.io,resources=*,verbs=get;list;watch -// +kubebuilder:rbac:groups=cluster.x-k8s.io,resources=machines;machines/status,verbs=get;list;watch +// +kubebuilder:rbac:groups=infrastructure.cluster.x-k8s.io,resources=awsmachines,verbs=create;get;list;watch;update;patch;delete +// +kubebuilder:rbac:groups=infrastructure.cluster.x-k8s.io,resources=awsmachines/status,verbs=get;update;patch +// +kubebuilder:rbac:groups=cluster.x-k8s.io,resources=machines,verbs=get;list;watch;delete +// +kubebuilder:rbac:groups=cluster.x-k8s.io,resources=machines/status,verbs=get;list;watch // +kubebuilder:rbac:groups="",resources=secrets;,verbs=get;list;watch // +kubebuilder:rbac:groups="",resources=namespaces,verbs=get;list;watch // +kubebuilder:rbac:groups="",resources=events,verbs=get;list;watch;create;update;patch @@ -458,6 +459,7 @@ func (r *AWSMachineReconciler) findInstance(machineScope *scope.MachineScope, ec return instance, nil } +//nolint:gocyclo func (r *AWSMachineReconciler) reconcileNormal(_ context.Context, machineScope *scope.MachineScope, clusterScope cloud.ClusterScoper, ec2Scope scope.EC2Scope, elbScope scope.ELBScope, objectStoreScope scope.S3Scope) (ctrl.Result, error) { machineScope.Trace("Reconciling AWSMachine") @@ -481,7 +483,7 @@ func (r *AWSMachineReconciler) reconcileNormal(_ context.Context, machineScope * } // Make sure bootstrap data is available and populated. - if machineScope.Machine.Spec.Bootstrap.DataSecretName == nil { + if !machineScope.IsMachinePoolMachine() && machineScope.Machine.Spec.Bootstrap.DataSecretName == nil { machineScope.Info("Bootstrap data secret reference is not yet available") conditions.MarkFalse(machineScope.AWSMachine, infrav1.InstanceReadyCondition, infrav1.WaitingForBootstrapDataReason, clusterv1.ConditionSeverityInfo, "") return ctrl.Result{}, nil @@ -496,6 +498,12 @@ func (r *AWSMachineReconciler) reconcileNormal(_ context.Context, machineScope * conditions.MarkUnknown(machineScope.AWSMachine, infrav1.InstanceReadyCondition, infrav1.InstanceNotFoundReason, "%s", err.Error()) return ctrl.Result{}, err } + if instance == nil && machineScope.IsMachinePoolMachine() { + err = errors.New("no instance found for machine pool") + machineScope.Error(err, "unable to find instance") + conditions.MarkUnknown(machineScope.AWSMachine, infrav1.InstanceReadyCondition, infrav1.InstanceNotFoundReason, "%s", err.Error()) + return ctrl.Result{}, err + } // If the AWSMachine doesn't have our finalizer, add it. if controllerutil.AddFinalizer(machineScope.AWSMachine, infrav1.MachineFinalizer) { @@ -585,9 +593,18 @@ func (r *AWSMachineReconciler) reconcileNormal(_ context.Context, machineScope * conditions.MarkTrue(machineScope.AWSMachine, infrav1.InstanceReadyCondition) case infrav1.InstanceStateShuttingDown, infrav1.InstanceStateTerminated: machineScope.SetNotReady() - machineScope.Info("Unexpected EC2 instance termination", "state", instance.State, "instance-id", *machineScope.GetInstanceID()) - r.Recorder.Eventf(machineScope.AWSMachine, corev1.EventTypeWarning, "InstanceUnexpectedTermination", "Unexpected EC2 instance termination") - conditions.MarkFalse(machineScope.AWSMachine, infrav1.InstanceReadyCondition, infrav1.InstanceTerminatedReason, clusterv1.ConditionSeverityError, "") + + if machineScope.IsMachinePoolMachine() { + // In an auto-scaling group, instance termination is perfectly normal on scale-down + // and therefore should not be reported as error. + machineScope.Info("EC2 instance of machine pool was terminated", "state", instance.State, "instance-id", *machineScope.GetInstanceID()) + r.Recorder.Eventf(machineScope.AWSMachine, corev1.EventTypeNormal, infrav1.InstanceTerminatedReason, "EC2 instance termination") + conditions.MarkFalse(machineScope.AWSMachine, infrav1.InstanceReadyCondition, infrav1.InstanceTerminatedReason, clusterv1.ConditionSeverityInfo, "") + } else { + machineScope.Info("Unexpected EC2 instance termination", "state", instance.State, "instance-id", *machineScope.GetInstanceID()) + r.Recorder.Eventf(machineScope.AWSMachine, corev1.EventTypeWarning, "InstanceUnexpectedTermination", "Unexpected EC2 instance termination") + conditions.MarkFalse(machineScope.AWSMachine, infrav1.InstanceReadyCondition, infrav1.InstanceTerminatedReason, clusterv1.ConditionSeverityError, "") + } default: machineScope.SetNotReady() machineScope.Info("EC2 instance state is undefined", "state", instance.State, "instance-id", *machineScope.GetInstanceID()) @@ -598,14 +615,18 @@ func (r *AWSMachineReconciler) reconcileNormal(_ context.Context, machineScope * } // reconcile the deletion of the bootstrap data secret now that we have updated instance state - if deleteSecretErr := r.deleteBootstrapData(machineScope, clusterScope, objectStoreScope); deleteSecretErr != nil { - r.Log.Error(deleteSecretErr, "unable to delete secrets") - return ctrl.Result{}, deleteSecretErr - } + if !machineScope.IsMachinePoolMachine() { + if deleteSecretErr := r.deleteBootstrapData(machineScope, clusterScope, objectStoreScope); deleteSecretErr != nil { + r.Log.Error(deleteSecretErr, "unable to delete secrets") + return ctrl.Result{}, deleteSecretErr + } - if instance.State == infrav1.InstanceStateTerminated { - machineScope.SetFailureReason("UpdateError") - machineScope.SetFailureMessage(errors.Errorf("EC2 instance state %q is unexpected", instance.State)) + // For machine pool machines, it is expected that the ASG terminates instances at any time, + // so no error is logged for those. + if instance.State == infrav1.InstanceStateTerminated { + machineScope.SetFailureReason("UpdateError") + machineScope.SetFailureMessage(errors.Errorf("EC2 instance state %q is unexpected", instance.State)) + } } // tasks that can take place during all known instance states @@ -875,9 +896,13 @@ func getIgnitionVersion(scope *scope.MachineScope) string { } func (r *AWSMachineReconciler) deleteBootstrapData(machineScope *scope.MachineScope, clusterScope cloud.ClusterScoper, objectStoreScope scope.S3Scope) error { - _, userDataFormat, err := machineScope.GetRawBootstrapDataWithFormat() - if client.IgnoreNotFound(err) != nil { - return errors.Wrap(err, "failed to get raw userdata") + var userDataFormat string + var err error + if machineScope.Machine.Spec.Bootstrap.DataSecretName != nil { + _, userDataFormat, err = machineScope.GetRawBootstrapDataWithFormat() + if client.IgnoreNotFound(err) != nil { + return errors.Wrap(err, "failed to get raw userdata") + } } if machineScope.UseSecretsManager(userDataFormat) { diff --git a/exp/api/v1beta1/conversion.go b/exp/api/v1beta1/conversion.go index 16af878660..6c4ed0efa1 100644 --- a/exp/api/v1beta1/conversion.go +++ b/exp/api/v1beta1/conversion.go @@ -52,6 +52,7 @@ func (src *AWSMachinePool) ConvertTo(dstRaw conversion.Hub) error { if restored.Spec.AvailabilityZoneSubnetType != nil { dst.Spec.AvailabilityZoneSubnetType = restored.Spec.AvailabilityZoneSubnetType } + dst.Status.InfrastructureMachineKind = restored.Status.InfrastructureMachineKind if restored.Spec.AWSLaunchTemplate.PrivateDNSName != nil { dst.Spec.AWSLaunchTemplate.PrivateDNSName = restored.Spec.AWSLaunchTemplate.PrivateDNSName @@ -90,7 +91,6 @@ func (src *AWSMachinePoolList) ConvertTo(dstRaw conversion.Hub) error { // ConvertFrom converts the v1beta2 AWSMachinePoolList receiver to v1beta1 AWSMachinePoolList. func (r *AWSMachinePoolList) ConvertFrom(srcRaw conversion.Hub) error { src := srcRaw.(*infrav1exp.AWSMachinePoolList) - return Convert_v1beta2_AWSMachinePoolList_To_v1beta1_AWSMachinePoolList(src, r, nil) } @@ -149,6 +149,10 @@ func Convert_v1beta2_AWSManagedMachinePoolSpec_To_v1beta1_AWSManagedMachinePoolS return autoConvert_v1beta2_AWSManagedMachinePoolSpec_To_v1beta1_AWSManagedMachinePoolSpec(in, out, s) } +func Convert_v1beta2_AWSMachinePoolStatus_To_v1beta1_AWSMachinePoolStatus(in *infrav1exp.AWSMachinePoolStatus, out *AWSMachinePoolStatus, s apiconversion.Scope) error { + return autoConvert_v1beta2_AWSMachinePoolStatus_To_v1beta1_AWSMachinePoolStatus(in, out, s) +} + // ConvertTo converts the v1beta1 AWSManagedMachinePoolList receiver to a v1beta2 AWSManagedMachinePoolList. func (src *AWSManagedMachinePoolList) ConvertTo(dstRaw conversion.Hub) error { dst := dstRaw.(*infrav1exp.AWSManagedMachinePoolList) diff --git a/exp/api/v1beta1/zz_generated.conversion.go b/exp/api/v1beta1/zz_generated.conversion.go index 7465c4a4c5..d24c482756 100644 --- a/exp/api/v1beta1/zz_generated.conversion.go +++ b/exp/api/v1beta1/zz_generated.conversion.go @@ -99,11 +99,6 @@ func RegisterConversions(s *runtime.Scheme) error { }); err != nil { return err } - if err := s.AddGeneratedConversionFunc((*v1beta2.AWSMachinePoolStatus)(nil), (*AWSMachinePoolStatus)(nil), func(a, b interface{}, scope conversion.Scope) error { - return Convert_v1beta2_AWSMachinePoolStatus_To_v1beta1_AWSMachinePoolStatus(a.(*v1beta2.AWSMachinePoolStatus), b.(*AWSMachinePoolStatus), scope) - }); err != nil { - return err - } if err := s.AddGeneratedConversionFunc((*AWSManagedMachinePool)(nil), (*v1beta2.AWSManagedMachinePool)(nil), func(a, b interface{}, scope conversion.Scope) error { return Convert_v1beta1_AWSManagedMachinePool_To_v1beta2_AWSManagedMachinePool(a.(*AWSManagedMachinePool), b.(*v1beta2.AWSManagedMachinePool), scope) }); err != nil { @@ -299,6 +294,11 @@ func RegisterConversions(s *runtime.Scheme) error { }); err != nil { return err } + if err := s.AddConversionFunc((*v1beta2.AWSMachinePoolStatus)(nil), (*AWSMachinePoolStatus)(nil), func(a, b interface{}, scope conversion.Scope) error { + return Convert_v1beta2_AWSMachinePoolStatus_To_v1beta1_AWSMachinePoolStatus(a.(*v1beta2.AWSMachinePoolStatus), b.(*AWSMachinePoolStatus), scope) + }); err != nil { + return err + } if err := s.AddConversionFunc((*v1beta2.AWSManagedMachinePoolSpec)(nil), (*AWSManagedMachinePoolSpec)(nil), func(a, b interface{}, scope conversion.Scope) error { return Convert_v1beta2_AWSManagedMachinePoolSpec_To_v1beta1_AWSManagedMachinePoolSpec(a.(*v1beta2.AWSManagedMachinePoolSpec), b.(*AWSManagedMachinePoolSpec), scope) }); err != nil { @@ -594,17 +594,13 @@ func autoConvert_v1beta2_AWSMachinePoolStatus_To_v1beta1_AWSMachinePoolStatus(in out.Instances = *(*[]AWSMachinePoolInstanceStatus)(unsafe.Pointer(&in.Instances)) out.LaunchTemplateID = in.LaunchTemplateID out.LaunchTemplateVersion = (*string)(unsafe.Pointer(in.LaunchTemplateVersion)) + // WARNING: in.InfrastructureMachineKind requires manual conversion: does not exist in peer-type out.FailureReason = (*string)(unsafe.Pointer(in.FailureReason)) out.FailureMessage = (*string)(unsafe.Pointer(in.FailureMessage)) out.ASGStatus = (*ASGStatus)(unsafe.Pointer(in.ASGStatus)) return nil } -// Convert_v1beta2_AWSMachinePoolStatus_To_v1beta1_AWSMachinePoolStatus is an autogenerated conversion function. -func Convert_v1beta2_AWSMachinePoolStatus_To_v1beta1_AWSMachinePoolStatus(in *v1beta2.AWSMachinePoolStatus, out *AWSMachinePoolStatus, s conversion.Scope) error { - return autoConvert_v1beta2_AWSMachinePoolStatus_To_v1beta1_AWSMachinePoolStatus(in, out, s) -} - func autoConvert_v1beta1_AWSManagedMachinePool_To_v1beta2_AWSManagedMachinePool(in *AWSManagedMachinePool, out *v1beta2.AWSManagedMachinePool, s conversion.Scope) error { out.ObjectMeta = in.ObjectMeta if err := Convert_v1beta1_AWSManagedMachinePoolSpec_To_v1beta2_AWSManagedMachinePoolSpec(&in.Spec, &out.Spec, s); err != nil { diff --git a/exp/api/v1beta2/awsmachinepool_types.go b/exp/api/v1beta2/awsmachinepool_types.go index 6d7c8d10cb..c9451730e5 100644 --- a/exp/api/v1beta2/awsmachinepool_types.go +++ b/exp/api/v1beta2/awsmachinepool_types.go @@ -209,6 +209,10 @@ type AWSMachinePoolStatus struct { // +optional LaunchTemplateVersion *string `json:"launchTemplateVersion,omitempty"` + // InfrastructureMachineKind is the kind of the infrastructure resources behind MachinePool Machines. + // +optional + InfrastructureMachineKind string `json:"infrastructureMachineKind,omitempty"` + // FailureReason will be set in the event that there is a terminal problem // reconciling the Machine and will contain a succinct value suitable // for machine interpretation. diff --git a/exp/api/v1beta2/conditions_consts.go b/exp/api/v1beta2/conditions_consts.go index 2b5acd8f20..0c7b0fbd9a 100644 --- a/exp/api/v1beta2/conditions_consts.go +++ b/exp/api/v1beta2/conditions_consts.go @@ -54,6 +54,11 @@ const ( InstanceRefreshNotReadyReason = "InstanceRefreshNotReady" // InstanceRefreshFailedReason used to report when there instance refresh is not initiated. InstanceRefreshFailedReason = "InstanceRefreshFailed" + + // AWSMachineCreationFailed reports if creating AWSMachines to represent ASG (machine pool) machines failed. + AWSMachineCreationFailed = "AWSMachineCreationFailed" + // AWSMachineDeletionFailed reports if deleting AWSMachines failed. + AWSMachineDeletionFailed = "AWSMachineDeletionFailed" ) const ( diff --git a/exp/api/v1beta2/types.go b/exp/api/v1beta2/types.go index c4455d4f83..aaf0487147 100644 --- a/exp/api/v1beta2/types.go +++ b/exp/api/v1beta2/types.go @@ -22,6 +22,11 @@ import ( infrav1 "sigs.k8s.io/cluster-api-provider-aws/v2/api/v1beta2" ) +const ( + // KindMachinePool is a MachinePool resource Kind + KindMachinePool string = "MachinePool" +) + // EBS can be used to automatically set up EBS volumes when an instance is launched. type EBS struct { // Encrypted is whether the volume should be encrypted or not. diff --git a/exp/controllers/awsmachinepool_controller.go b/exp/controllers/awsmachinepool_controller.go index 837d1f8346..4c50142390 100644 --- a/exp/controllers/awsmachinepool_controller.go +++ b/exp/controllers/awsmachinepool_controller.go @@ -20,6 +20,7 @@ package controllers import ( "context" "fmt" + "time" "github.com/google/go-cmp/cmp" "github.com/google/go-cmp/cmp/cmpopts" @@ -43,6 +44,7 @@ import ( "sigs.k8s.io/cluster-api-provider-aws/v2/controllers" ekscontrolplanev1 "sigs.k8s.io/cluster-api-provider-aws/v2/controlplane/eks/api/v1beta2" expinfrav1 "sigs.k8s.io/cluster-api-provider-aws/v2/exp/api/v1beta2" + "sigs.k8s.io/cluster-api-provider-aws/v2/feature" "sigs.k8s.io/cluster-api-provider-aws/v2/pkg/cloud" "sigs.k8s.io/cluster-api-provider-aws/v2/pkg/cloud/scope" "sigs.k8s.io/cluster-api-provider-aws/v2/pkg/cloud/services" @@ -174,19 +176,27 @@ func (r *AWSMachinePoolReconciler) Reconcile(ctx context.Context, req ctrl.Reque } }() + if feature.Gates.Enabled(feature.MachinePoolMachines) { + // Patch now so that the status and selectors are available. + awsMachinePool.Status.InfrastructureMachineKind = "AWSMachine" + if err := machinePoolScope.PatchObject(); err != nil { + return ctrl.Result{}, errors.Wrap(err, "failed to patch AWSMachinePool status") + } + } + switch infraScope := infraCluster.(type) { case *scope.ManagedControlPlaneScope: if !awsMachinePool.ObjectMeta.DeletionTimestamp.IsZero() { - return ctrl.Result{}, r.reconcileDelete(machinePoolScope, infraScope, infraScope) + return ctrl.Result{}, r.reconcileDelete(ctx, machinePoolScope, infraScope, infraScope) } - return ctrl.Result{}, r.reconcileNormal(ctx, machinePoolScope, infraScope, infraScope) + return r.reconcileNormal(ctx, machinePoolScope, infraScope, infraScope) case *scope.ClusterScope: if !awsMachinePool.ObjectMeta.DeletionTimestamp.IsZero() { - return ctrl.Result{}, r.reconcileDelete(machinePoolScope, infraScope, infraScope) + return ctrl.Result{}, r.reconcileDelete(ctx, machinePoolScope, infraScope, infraScope) } - return ctrl.Result{}, r.reconcileNormal(ctx, machinePoolScope, infraScope, infraScope) + return r.reconcileNormal(ctx, machinePoolScope, infraScope, infraScope) default: return ctrl.Result{}, errors.New("infraCluster has unknown type") } @@ -226,7 +236,7 @@ func (r *AWSMachinePoolReconciler) SetupWithManager(ctx context.Context, mgr ctr Complete(r) } -func (r *AWSMachinePoolReconciler) reconcileNormal(ctx context.Context, machinePoolScope *scope.MachinePoolScope, clusterScope cloud.ClusterScoper, ec2Scope scope.EC2Scope) error { +func (r *AWSMachinePoolReconciler) reconcileNormal(ctx context.Context, machinePoolScope *scope.MachinePoolScope, clusterScope cloud.ClusterScoper, ec2Scope scope.EC2Scope) (ctrl.Result, error) { clusterScope.Info("Reconciling AWSMachinePool") // If the AWSMachine is in an error state, return early. @@ -235,28 +245,28 @@ func (r *AWSMachinePoolReconciler) reconcileNormal(ctx context.Context, machineP // TODO: If we are in a failed state, delete the secret regardless of instance state - return nil + return ctrl.Result{}, nil } // If the AWSMachinepool doesn't have our finalizer, add it if controllerutil.AddFinalizer(machinePoolScope.AWSMachinePool, expinfrav1.MachinePoolFinalizer) { // Register finalizer immediately to avoid orphaning AWS resources if err := machinePoolScope.PatchObject(); err != nil { - return err + return ctrl.Result{}, err } } if !machinePoolScope.Cluster.Status.InfrastructureReady { machinePoolScope.Info("Cluster infrastructure is not ready yet") conditions.MarkFalse(machinePoolScope.AWSMachinePool, expinfrav1.ASGReadyCondition, infrav1.WaitingForClusterInfrastructureReason, clusterv1.ConditionSeverityInfo, "") - return nil + return ctrl.Result{}, nil } // Make sure bootstrap data is available and populated if machinePoolScope.MachinePool.Spec.Template.Spec.Bootstrap.DataSecretName == nil { machinePoolScope.Info("Bootstrap data secret reference is not yet available") conditions.MarkFalse(machinePoolScope.AWSMachinePool, expinfrav1.ASGReadyCondition, infrav1.WaitingForBootstrapDataReason, clusterv1.ConditionSeverityInfo, "") - return nil + return ctrl.Result{}, nil } ec2Svc := r.getEC2Service(ec2Scope) @@ -267,7 +277,7 @@ func (r *AWSMachinePoolReconciler) reconcileNormal(ctx context.Context, machineP asg, err := r.findASG(machinePoolScope, asgsvc) if err != nil { conditions.MarkUnknown(machinePoolScope.AWSMachinePool, expinfrav1.ASGReadyCondition, expinfrav1.ASGNotFoundReason, "%s", err.Error()) - return err + return ctrl.Result{}, err } canUpdateLaunchTemplate := func() (bool, error) { @@ -307,7 +317,7 @@ func (r *AWSMachinePoolReconciler) reconcileNormal(ctx context.Context, machineP if err := reconSvc.ReconcileLaunchTemplate(machinePoolScope, ec2Svc, canUpdateLaunchTemplate, runPostLaunchTemplateUpdateOperation); err != nil { r.Recorder.Eventf(machinePoolScope.AWSMachinePool, corev1.EventTypeWarning, "FailedLaunchTemplateReconcile", "Failed to reconcile launch template: %v", err) machinePoolScope.Error(err, "failed to reconcile launch template") - return err + return ctrl.Result{}, err } // set the LaunchTemplateReady condition @@ -317,9 +327,30 @@ func (r *AWSMachinePoolReconciler) reconcileNormal(ctx context.Context, machineP // Create new ASG if err := r.createPool(machinePoolScope, clusterScope); err != nil { conditions.MarkFalse(machinePoolScope.AWSMachinePool, expinfrav1.ASGReadyCondition, expinfrav1.ASGProvisionFailedReason, clusterv1.ConditionSeverityError, "%s", err.Error()) - return err + return ctrl.Result{}, err + } + return ctrl.Result{ + RequeueAfter: 15 * time.Second, + }, nil + } + + if feature.Gates.Enabled(feature.MachinePoolMachines) { + awsMachineList, err := getAWSMachines(ctx, machinePoolScope.MachinePool, r.Client) + if err != nil { + return ctrl.Result{}, err + } + + if err := createAWSMachinesIfNotExists(ctx, awsMachineList, machinePoolScope.MachinePool, &machinePoolScope.AWSMachinePool.ObjectMeta, &machinePoolScope.AWSMachinePool.TypeMeta, asg, machinePoolScope.GetLogger(), r.Client, ec2Svc); err != nil { + machinePoolScope.SetNotReady() + conditions.MarkFalse(machinePoolScope.AWSMachinePool, clusterv1.ReadyCondition, expinfrav1.AWSMachineCreationFailed, clusterv1.ConditionSeverityWarning, "%s", err.Error()) + return ctrl.Result{}, fmt.Errorf("failed to create awsmachines: %w", err) + } + + if err := deleteOrphanedAWSMachines(ctx, awsMachineList, asg, machinePoolScope.GetLogger(), r.Client); err != nil { + machinePoolScope.SetNotReady() + conditions.MarkFalse(machinePoolScope.AWSMachinePool, clusterv1.ReadyCondition, expinfrav1.AWSMachineDeletionFailed, clusterv1.ConditionSeverityWarning, "%s", err.Error()) + return ctrl.Result{}, fmt.Errorf("failed to clean up awsmachines: %w", err) } - return nil } if annotations.ReplicasManagedByExternalAutoscaler(machinePoolScope.MachinePool) { @@ -330,14 +361,14 @@ func (r *AWSMachinePoolReconciler) reconcileNormal(ctx context.Context, machineP "external", asg.DesiredCapacity) machinePoolScope.MachinePool.Spec.Replicas = asg.DesiredCapacity if err := machinePoolScope.PatchCAPIMachinePoolObject(ctx); err != nil { - return err + return ctrl.Result{}, err } } } if err := r.updatePool(machinePoolScope, clusterScope, asg); err != nil { machinePoolScope.Error(err, "error updating AWSMachinePool") - return err + return ctrl.Result{}, err } launchTemplateID := machinePoolScope.GetLaunchTemplateIDStatus() @@ -354,7 +385,7 @@ func (r *AWSMachinePoolReconciler) reconcileNormal(ctx context.Context, machineP } err = reconSvc.ReconcileTags(machinePoolScope, resourceServiceToUpdate) if err != nil { - return errors.Wrap(err, "error updating tags") + return ctrl.Result{}, errors.Wrap(err, "error updating tags") } // Make sure Spec.ProviderID is always set. @@ -377,12 +408,28 @@ func (r *AWSMachinePoolReconciler) reconcileNormal(ctx context.Context, machineP machinePoolScope.Error(err, "failed updating instances", "instances", asg.Instances) } - return nil + if feature.Gates.Enabled(feature.MachinePoolMachines) { + return ctrl.Result{ + // Regularly update `AWSMachine` objects, for example if ASG was scaled or refreshed instances + // TODO: Requeueing interval can be removed or prolonged once reconciliation of ASG EC2 instances + // can be triggered by events (e.g. with feature gate `EventBridgeInstanceState`). + // See https://github.com/kubernetes-sigs/cluster-api-provider-aws/issues/5323. + RequeueAfter: 3 * time.Minute, + }, nil + } + + return ctrl.Result{}, nil } -func (r *AWSMachinePoolReconciler) reconcileDelete(machinePoolScope *scope.MachinePoolScope, clusterScope cloud.ClusterScoper, ec2Scope scope.EC2Scope) error { +func (r *AWSMachinePoolReconciler) reconcileDelete(ctx context.Context, machinePoolScope *scope.MachinePoolScope, clusterScope cloud.ClusterScoper, ec2Scope scope.EC2Scope) error { clusterScope.Info("Handling deleted AWSMachinePool") + if feature.Gates.Enabled(feature.MachinePoolMachines) { + if err := reconcileDeleteAWSMachines(ctx, machinePoolScope.MachinePool, r.Client, machinePoolScope.GetLogger()); err != nil { + return err + } + } + ec2Svc := r.getEC2Service(ec2Scope) asgSvc := r.getASGService(clusterScope) diff --git a/exp/controllers/awsmachinepool_controller_test.go b/exp/controllers/awsmachinepool_controller_test.go index dff7f24ff6..d849aebca2 100644 --- a/exp/controllers/awsmachinepool_controller_test.go +++ b/exp/controllers/awsmachinepool_controller_test.go @@ -33,12 +33,15 @@ import ( "k8s.io/apimachinery/pkg/runtime" apimachinerytypes "k8s.io/apimachinery/pkg/types" "k8s.io/client-go/tools/record" + utilfeature "k8s.io/component-base/featuregate/testing" "k8s.io/klog/v2" "k8s.io/utils/ptr" + "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/fake" infrav1 "sigs.k8s.io/cluster-api-provider-aws/v2/api/v1beta2" expinfrav1 "sigs.k8s.io/cluster-api-provider-aws/v2/exp/api/v1beta2" + "sigs.k8s.io/cluster-api-provider-aws/v2/feature" "sigs.k8s.io/cluster-api-provider-aws/v2/pkg/cloud" "sigs.k8s.io/cluster-api-provider-aws/v2/pkg/cloud/scope" "sigs.k8s.io/cluster-api-provider-aws/v2/pkg/cloud/services" @@ -48,6 +51,7 @@ import ( clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" expclusterv1 "sigs.k8s.io/cluster-api/exp/api/v1beta1" "sigs.k8s.io/cluster-api/util/conditions" + "sigs.k8s.io/cluster-api/util/labels/format" "sigs.k8s.io/cluster-api/util/patch" ) @@ -100,6 +104,7 @@ func TestAWSMachinePoolReconciler(t *testing.T) { }, }, }, + Status: expinfrav1.AWSMachinePoolStatus{}, } secret = &corev1.Secret{ @@ -119,6 +124,12 @@ func TestAWSMachinePoolReconciler(t *testing.T) { g.Expect(testEnv.Create(ctx, awsMachinePool)).To(Succeed()) g.Expect(testEnv.Create(ctx, secret)).To(Succeed()) + // Used in owner reference for AWSMachinePool AWSMachines + awsMachinePool.TypeMeta = metav1.TypeMeta{ + APIVersion: expinfrav1.GroupVersion.String(), + Kind: "AWSMachinePool", + } + cs, err = setupCluster("test-cluster") g.Expect(err).To(BeNil()) @@ -134,6 +145,11 @@ func TestAWSMachinePoolReconciler(t *testing.T) { ObjectMeta: metav1.ObjectMeta{ Name: "mp", Namespace: "default", + UID: "1", + }, + TypeMeta: metav1.TypeMeta{ + APIVersion: "cluster.x-k8s.io/v1beta1", + Kind: "MachinePool", }, Spec: expclusterv1.MachinePoolSpec{ ClusterName: "test", @@ -172,6 +188,7 @@ func TestAWSMachinePoolReconciler(t *testing.T) { return reconSvc }, Recorder: recorder, + Client: testEnv.Client, } } @@ -210,7 +227,7 @@ func TestAWSMachinePoolReconciler(t *testing.T) { buf := new(bytes.Buffer) klog.SetOutput(buf) - _ = reconciler.reconcileNormal(context.Background(), ms, cs, cs) + _, _ = reconciler.reconcileNormal(context.Background(), ms, cs, cs) g.Expect(buf).To(ContainSubstring("Error state detected, skipping reconciliation")) }) t.Run("should add our finalizer to the machinepool", func(t *testing.T) { @@ -219,7 +236,7 @@ func TestAWSMachinePoolReconciler(t *testing.T) { defer teardown(t, g) getASG(t, g) - _ = reconciler.reconcileNormal(context.Background(), ms, cs, cs) + _, _ = reconciler.reconcileNormal(context.Background(), ms, cs, cs) g.Expect(ms.AWSMachinePool.Finalizers).To(ContainElement(expinfrav1.MachinePoolFinalizer)) }) @@ -234,7 +251,7 @@ func TestAWSMachinePoolReconciler(t *testing.T) { buf := new(bytes.Buffer) klog.SetOutput(buf) - err := reconciler.reconcileNormal(context.Background(), ms, cs, cs) + _, err := reconciler.reconcileNormal(context.Background(), ms, cs, cs) g.Expect(err).To(BeNil()) g.Expect(buf.String()).To(ContainSubstring("Cluster infrastructure is not ready yet")) expectConditions(g, ms.AWSMachinePool, []conditionAssertion{{expinfrav1.ASGReadyCondition, corev1.ConditionFalse, clusterv1.ConditionSeverityInfo, infrav1.WaitingForClusterInfrastructureReason}}) @@ -249,7 +266,7 @@ func TestAWSMachinePoolReconciler(t *testing.T) { buf := new(bytes.Buffer) klog.SetOutput(buf) - err := reconciler.reconcileNormal(context.Background(), ms, cs, cs) + _, err := reconciler.reconcileNormal(context.Background(), ms, cs, cs) g.Expect(err).To(BeNil()) g.Expect(buf.String()).To(ContainSubstring("Bootstrap data secret reference is not yet available")) @@ -277,10 +294,186 @@ func TestAWSMachinePoolReconciler(t *testing.T) { expectedErr := errors.New("no connection available ") reconSvc.EXPECT().ReconcileLaunchTemplate(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).Return(expectedErr) - err := reconciler.reconcileNormal(context.Background(), ms, cs, cs) + _, err := reconciler.reconcileNormal(context.Background(), ms, cs, cs) g.Expect(errors.Cause(err)).To(MatchError(expectedErr)) }) }) + t.Run("there are nodes in the asg which need awsmachines", func(t *testing.T) { + t.Run("should not create awsmachines for the nodes if feature gate is disabled", func(t *testing.T) { + utilfeature.SetFeatureGateDuringTest(t, feature.Gates, feature.MachinePoolMachines, false) + + g := NewWithT(t) + setup(t, g) + defer teardown(t, g) + + asg := &expinfrav1.AutoScalingGroup{ + Name: "name", + Instances: []infrav1.Instance{ + { + ID: "1", + }, + { + ID: "2", + }, + }, + Subnets: []string{}, + } + + reconSvc.EXPECT().ReconcileLaunchTemplate(gomock.Any(), ec2Svc, gomock.Any(), gomock.Any()).Return(nil) + asgSvc.EXPECT().GetASGByName(gomock.Any()).Return(asg, nil) + asgSvc.EXPECT().SubnetIDs(gomock.Any()).Return([]string{}, nil) + asgSvc.EXPECT().UpdateASG(gomock.Any()).Return(nil) + reconSvc.EXPECT().ReconcileTags(gomock.Any(), gomock.Any()).Return(nil) + + _, err := reconciler.reconcileNormal(context.Background(), ms, cs, cs) + g.Expect(err).To(Succeed()) + + g.Eventually(func() int { + awsMachines := &infrav1.AWSMachineList{} + if err := testEnv.List(ctx, awsMachines, client.InNamespace(ms.AWSMachinePool.Namespace)); err != nil { + return -1 + } + return len(awsMachines.Items) + }).Should(BeZero()) + }) + t.Run("should create awsmachines for the nodes", func(t *testing.T) { + utilfeature.SetFeatureGateDuringTest(t, feature.Gates, feature.MachinePoolMachines, true) + + g := NewWithT(t) + setup(t, g) + defer teardown(t, g) + + asg := &expinfrav1.AutoScalingGroup{ + Name: "name", + Instances: []infrav1.Instance{ + { + ID: "1", + }, + { + ID: "2", + }, + }, + Subnets: []string{}, + } + + reconSvc.EXPECT().ReconcileLaunchTemplate(gomock.Any(), ec2Svc, gomock.Any(), gomock.Any()).Return(nil) + asgSvc.EXPECT().GetASGByName(gomock.Any()).Return(asg, nil) + ec2Svc.EXPECT().InstanceIfExists(aws.String("1")).Return(&infrav1.Instance{ID: "1", Type: "m6.2xlarge"}, nil) + ec2Svc.EXPECT().InstanceIfExists(aws.String("2")).Return(&infrav1.Instance{ID: "2", Type: "m6.2xlarge"}, nil) + asgSvc.EXPECT().SubnetIDs(gomock.Any()).Return([]string{}, nil) + asgSvc.EXPECT().UpdateASG(gomock.Any()).Return(nil) + reconSvc.EXPECT().ReconcileTags(gomock.Any(), gomock.Any()).Return(nil) + + _, err := reconciler.reconcileNormal(context.Background(), ms, cs, cs) + g.Expect(err).To(Succeed()) + + g.Eventually(func() int { + awsMachines := &infrav1.AWSMachineList{} + if err := testEnv.List(ctx, awsMachines, client.InNamespace(ms.AWSMachinePool.Namespace)); err != nil { + return -1 + } + return len(awsMachines.Items) + }).Should(BeEquivalentTo(len(asg.Instances))) + }) + t.Run("should delete awsmachines for nodes removed from the asg", func(t *testing.T) { + utilfeature.SetFeatureGateDuringTest(t, feature.Gates, feature.MachinePoolMachines, true) + + g := NewWithT(t) + setup(t, g) + defer teardown(t, g) + + asg := &expinfrav1.AutoScalingGroup{ + Name: "name", + Instances: []infrav1.Instance{ + { + ID: "1", + }, + }, + Subnets: []string{}, + } + g.Expect(testEnv.Create(context.Background(), &clusterv1.Machine{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: ms.AWSMachinePool.Namespace, + Name: "name-1", + UID: "1", + }, + Spec: clusterv1.MachineSpec{ + ClusterName: "test", + }, + })).To(Succeed()) + g.Expect(testEnv.Create(context.Background(), &infrav1.AWSMachine{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: ms.AWSMachinePool.Namespace, + Name: "name-1", + Labels: map[string]string{ + clusterv1.MachinePoolNameLabel: format.MustFormatValue(ms.MachinePool.Name), + clusterv1.ClusterNameLabel: ms.MachinePool.Spec.ClusterName, + }, + OwnerReferences: []metav1.OwnerReference{ + { + APIVersion: "v1beta1", + Kind: "Machine", + Name: "name-1", + UID: "1", + }, + }, + }, + Spec: infrav1.AWSMachineSpec{ + ProviderID: aws.String("1"), + InstanceType: "m6.2xlarge", + }, + })).To(Succeed()) + g.Expect(testEnv.Create(context.Background(), &clusterv1.Machine{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: ms.AWSMachinePool.Namespace, + Name: "name-2", + UID: "2", + }, + Spec: clusterv1.MachineSpec{ + ClusterName: "test", + }, + })).To(Succeed()) + g.Expect(testEnv.Create(context.Background(), &infrav1.AWSMachine{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: ms.AWSMachinePool.Namespace, + Name: "name-2", + Labels: map[string]string{ + clusterv1.MachinePoolNameLabel: format.MustFormatValue(ms.MachinePool.Name), + clusterv1.ClusterNameLabel: ms.MachinePool.Spec.ClusterName, + }, + OwnerReferences: []metav1.OwnerReference{ + { + APIVersion: "v1beta1", + Kind: "Machine", + Name: "name-2", + UID: "2", + }, + }, + }, + Spec: infrav1.AWSMachineSpec{ + ProviderID: aws.String("2"), + InstanceType: "m6.2xlarge", + }, + })).To(Succeed()) + + reconSvc.EXPECT().ReconcileLaunchTemplate(gomock.Any(), ec2Svc, gomock.Any(), gomock.Any()).Return(nil) + asgSvc.EXPECT().GetASGByName(gomock.Any()).Return(asg, nil) + asgSvc.EXPECT().SubnetIDs(gomock.Any()).Return([]string{}, nil) + asgSvc.EXPECT().UpdateASG(gomock.Any()).Return(nil) + reconSvc.EXPECT().ReconcileTags(gomock.Any(), gomock.Any()).Return(nil) + + _, err := reconciler.reconcileNormal(context.Background(), ms, cs, cs) + g.Expect(err).To(Succeed()) + + g.Eventually(func() int { + awsMachines := &infrav1.AWSMachineList{} + if err := testEnv.List(ctx, awsMachines, client.InNamespace(ms.AWSMachinePool.Namespace)); err != nil { + return -1 + } + return len(awsMachines.Items) + }).Should(BeEquivalentTo(len(asg.Instances))) + }) + }) t.Run("there's suspended processes provided during ASG creation", func(t *testing.T) { setSuspendedProcesses := func(t *testing.T, g *WithT) { t.Helper() @@ -304,7 +497,7 @@ func TestAWSMachinePoolReconciler(t *testing.T) { }, nil) asgSvc.EXPECT().SuspendProcesses("name", []string{"Launch", "Terminate"}).Return(nil).AnyTimes().Times(0) - err := reconciler.reconcileNormal(context.Background(), ms, cs, cs) + _, err := reconciler.reconcileNormal(context.Background(), ms, cs, cs) g.Expect(err).To(Succeed()) }) }) @@ -340,7 +533,7 @@ func TestAWSMachinePoolReconciler(t *testing.T) { "ReplaceUnhealthy", })).Return(nil).AnyTimes().Times(1) - err := reconciler.reconcileNormal(context.Background(), ms, cs, cs) + _, err := reconciler.reconcileNormal(context.Background(), ms, cs, cs) g.Expect(err).To(Succeed()) }) }) @@ -372,7 +565,7 @@ func TestAWSMachinePoolReconciler(t *testing.T) { asgSvc.EXPECT().SuspendProcesses("name", []string{"Terminate"}).Return(nil).AnyTimes().Times(1) asgSvc.EXPECT().ResumeProcesses("name", []string{"process3"}).Return(nil).AnyTimes().Times(1) - err := reconciler.reconcileNormal(context.Background(), ms, cs, cs) + _, err := reconciler.reconcileNormal(context.Background(), ms, cs, cs) g.Expect(err).To(Succeed()) }) }) @@ -397,9 +590,10 @@ func TestAWSMachinePoolReconciler(t *testing.T) { } ms.MachinePool.Spec.Replicas = ptr.To[int32](0) - g.Expect(testEnv.Create(ctx, ms.MachinePool)).To(Succeed()) + g.Expect(testEnv.Create(ctx, ms.MachinePool.DeepCopy())).To(Succeed()) - _ = reconciler.reconcileNormal(context.Background(), ms, cs, cs) + _, err := reconciler.reconcileNormal(context.Background(), ms, cs, cs) + g.Expect(err).To(Succeed()) g.Expect(*ms.MachinePool.Spec.Replicas).To(Equal(int32(1))) }) t.Run("No need to update Asg because asgNeedsUpdates is false and no subnets change", func(t *testing.T) { @@ -430,7 +624,7 @@ func TestAWSMachinePoolReconciler(t *testing.T) { asgSvc.EXPECT().SubnetIDs(gomock.Any()).Return([]string{"subnet2", "subnet1"}, nil).Times(1) asgSvc.EXPECT().UpdateASG(gomock.Any()).Return(nil).Times(0) - err := reconciler.reconcileNormal(context.Background(), ms, cs, cs) + _, err := reconciler.reconcileNormal(context.Background(), ms, cs, cs) g.Expect(err).To(Succeed()) }) t.Run("update Asg due to subnet changes", func(t *testing.T) { @@ -448,7 +642,7 @@ func TestAWSMachinePoolReconciler(t *testing.T) { asgSvc.EXPECT().SubnetIDs(gomock.Any()).Return([]string{"subnet1"}, nil).Times(1) asgSvc.EXPECT().UpdateASG(gomock.Any()).Return(nil).Times(1) - err := reconciler.reconcileNormal(context.Background(), ms, cs, cs) + _, err := reconciler.reconcileNormal(context.Background(), ms, cs, cs) g.Expect(err).To(Succeed()) }) t.Run("update Asg due to asgNeedsUpdates returns true", func(t *testing.T) { @@ -466,7 +660,7 @@ func TestAWSMachinePoolReconciler(t *testing.T) { asgSvc.EXPECT().SubnetIDs(gomock.Any()).Return([]string{}, nil).Times(1) asgSvc.EXPECT().UpdateASG(gomock.Any()).Return(nil).Times(1) - err := reconciler.reconcileNormal(context.Background(), ms, cs, cs) + _, err := reconciler.reconcileNormal(context.Background(), ms, cs, cs) g.Expect(err).To(Succeed()) }) @@ -491,7 +685,7 @@ func TestAWSMachinePoolReconciler(t *testing.T) { }, nil }) - err := reconciler.reconcileNormal(context.Background(), ms, cs, cs) + _, err := reconciler.reconcileNormal(context.Background(), ms, cs, cs) g.Expect(err).To(Succeed()) }) @@ -538,7 +732,7 @@ func TestAWSMachinePoolReconciler(t *testing.T) { // No changes, so there must not be an ASG update! asgSvc.EXPECT().UpdateASG(gomock.Any()).Times(0) - err := reconciler.reconcileNormal(context.Background(), ms, cs, cs) + _, err := reconciler.reconcileNormal(context.Background(), ms, cs, cs) g.Expect(err).To(Succeed()) }) @@ -591,7 +785,7 @@ func TestAWSMachinePoolReconciler(t *testing.T) { // No changes, so there must not be an ASG update! asgSvc.EXPECT().UpdateASG(gomock.Any()).Times(0) - err := reconciler.reconcileNormal(context.Background(), ms, cs, cs) + _, err := reconciler.reconcileNormal(context.Background(), ms, cs, cs) g.Expect(err).To(Succeed()) }) @@ -647,7 +841,7 @@ func TestAWSMachinePoolReconciler(t *testing.T) { // No changes, so there must not be an ASG update! asgSvc.EXPECT().UpdateASG(gomock.Any()).Times(0) - err := reconciler.reconcileNormal(context.Background(), ms, cs, cs) + _, err := reconciler.reconcileNormal(context.Background(), ms, cs, cs) g.Expect(err).To(Succeed()) }) @@ -669,7 +863,7 @@ func TestAWSMachinePoolReconciler(t *testing.T) { }, nil }) - err := reconciler.reconcileNormal(context.Background(), ms, cs, cs) + _, err := reconciler.reconcileNormal(context.Background(), ms, cs, cs) g.Expect(err).To(Succeed()) g.Expect(ms.AWSMachinePool.Status.LaunchTemplateID).ToNot(BeEmpty()) @@ -731,7 +925,7 @@ func TestAWSMachinePoolReconciler(t *testing.T) { // No changes, so there must not be an ASG update! asgSvc.EXPECT().UpdateASG(gomock.Any()).Times(0) - err = reconciler.reconcileNormal(context.Background(), ms, cs, cs) + _, err = reconciler.reconcileNormal(context.Background(), ms, cs, cs) g.Expect(err).To(Succeed()) }) }) @@ -755,7 +949,7 @@ func TestAWSMachinePoolReconciler(t *testing.T) { expectedErr := errors.New("no connection available ") asgSvc.EXPECT().GetASGByName(gomock.Any()).Return(nil, expectedErr).AnyTimes() - err := reconciler.reconcileDelete(ms, cs, cs) + err := reconciler.reconcileDelete(context.Background(), ms, cs, cs) g.Expect(errors.Cause(err)).To(MatchError(expectedErr)) }) t.Run("should log and remove finalizer when no machinepool exists", func(t *testing.T) { @@ -770,7 +964,7 @@ func TestAWSMachinePoolReconciler(t *testing.T) { buf := new(bytes.Buffer) klog.SetOutput(buf) - err := reconciler.reconcileDelete(ms, cs, cs) + err := reconciler.reconcileDelete(context.Background(), ms, cs, cs) g.Expect(err).To(BeNil()) g.Expect(buf.String()).To(ContainSubstring("Unable to locate ASG")) g.Expect(ms.AWSMachinePool.Finalizers).To(ConsistOf(metav1.FinalizerDeleteDependents)) @@ -791,7 +985,8 @@ func TestAWSMachinePoolReconciler(t *testing.T) { buf := new(bytes.Buffer) klog.SetOutput(buf) - err := reconciler.reconcileDelete(ms, cs, cs) + err := reconciler.reconcileDelete(context.Background(), ms, cs, cs) + g.Expect(err).To(BeNil()) g.Expect(ms.AWSMachinePool.Status.Ready).To(BeFalse()) g.Eventually(recorder.Events).Should(Receive(ContainSubstring("DeletionInProgress"))) diff --git a/exp/controllers/awsmachinepool_machines.go b/exp/controllers/awsmachinepool_machines.go new file mode 100644 index 0000000000..a7aae3d666 --- /dev/null +++ b/exp/controllers/awsmachinepool_machines.go @@ -0,0 +1,207 @@ +package controllers + +import ( + "context" + "fmt" + + "github.com/aws/aws-sdk-go/aws" + "github.com/go-logr/logr" + "github.com/pkg/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/klog/v2" + "k8s.io/utils/ptr" + "sigs.k8s.io/controller-runtime/pkg/client" + + infrav1 "sigs.k8s.io/cluster-api-provider-aws/v2/api/v1beta2" + expinfrav1 "sigs.k8s.io/cluster-api-provider-aws/v2/exp/api/v1beta2" + "sigs.k8s.io/cluster-api-provider-aws/v2/feature" + "sigs.k8s.io/cluster-api-provider-aws/v2/pkg/cloud/services" + "sigs.k8s.io/cluster-api-provider-aws/v2/pkg/cloud/services/ec2" + clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" + expclusterv1 "sigs.k8s.io/cluster-api/exp/api/v1beta1" + "sigs.k8s.io/cluster-api/util" + "sigs.k8s.io/cluster-api/util/labels/format" +) + +func createAWSMachinesIfNotExists(ctx context.Context, awsMachineList *infrav1.AWSMachineList, mp *expclusterv1.MachinePool, infraMachinePoolMeta *metav1.ObjectMeta, infraMachinePoolType *metav1.TypeMeta, existingASG *expinfrav1.AutoScalingGroup, l logr.Logger, client client.Client, ec2Svc services.EC2Interface) error { + if !feature.Gates.Enabled(feature.MachinePoolMachines) { + return errors.New("createAWSMachinesIfNotExists must not be called unless the MachinePoolMachines feature gate is enabled") + } + + l.V(4).Info("Creating missing AWSMachines") + + providerIDToAWSMachine := make(map[string]infrav1.AWSMachine, len(awsMachineList.Items)) + for i := range awsMachineList.Items { + awsMachine := awsMachineList.Items[i] + if awsMachine.Spec.ProviderID == nil || *awsMachine.Spec.ProviderID == "" { + continue + } + providerID := *awsMachine.Spec.ProviderID + providerIDToAWSMachine[providerID] = awsMachine + } + + for i := range existingASG.Instances { + instanceID := existingASG.Instances[i].ID + providerID := fmt.Sprintf("aws:///%s/%s", existingASG.Instances[i].AvailabilityZone, instanceID) + + instanceLogger := l.WithValues("providerID", providerID, "instanceID", instanceID, "asg", existingASG.Name) + instanceLogger.V(4).Info("Checking if machine pool AWSMachine is up to date") + if _, exists := providerIDToAWSMachine[providerID]; exists { + continue + } + + instance, err := ec2Svc.InstanceIfExists(&instanceID) + if errors.Is(err, ec2.ErrInstanceNotFoundByID) { + instanceLogger.V(4).Info("Instance not found, it may have already been deleted") + continue + } + if err != nil { + return fmt.Errorf("failed to look up EC2 instance %q: %w", instanceID, err) + } + + securityGroups := make([]infrav1.AWSResourceReference, 0, len(instance.SecurityGroupIDs)) + for j := range instance.SecurityGroupIDs { + securityGroups = append(securityGroups, infrav1.AWSResourceReference{ + ID: aws.String(instance.SecurityGroupIDs[j]), + }) + } + + awsMachine := &infrav1.AWSMachine{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: mp.Namespace, + GenerateName: fmt.Sprintf("%s-", existingASG.Name), + Labels: map[string]string{ + clusterv1.MachinePoolNameLabel: format.MustFormatValue(mp.Name), + clusterv1.ClusterNameLabel: mp.Spec.ClusterName, + }, + OwnerReferences: []metav1.OwnerReference{ + { + APIVersion: infraMachinePoolType.APIVersion, + Kind: infraMachinePoolType.Kind, + Name: infraMachinePoolMeta.Name, + BlockOwnerDeletion: ptr.To(true), + UID: infraMachinePoolMeta.UID, + }, + }, + }, + Spec: infrav1.AWSMachineSpec{ + ProviderID: aws.String(providerID), + InstanceID: aws.String(instanceID), + + // Store some extra fields for informational purposes (not needed by CAPA) + AMI: infrav1.AMIReference{ + ID: aws.String(instance.ImageID), + }, + InstanceType: instance.Type, + PublicIP: aws.Bool(instance.PublicIP != nil), + SSHKeyName: instance.SSHKeyName, + InstanceMetadataOptions: instance.InstanceMetadataOptions, + IAMInstanceProfile: instance.IAMProfile, + AdditionalSecurityGroups: securityGroups, + Subnet: &infrav1.AWSResourceReference{ID: aws.String(instance.SubnetID)}, + RootVolume: instance.RootVolume, + NonRootVolumes: instance.NonRootVolumes, + NetworkInterfaces: instance.NetworkInterfaces, + CloudInit: infrav1.CloudInit{}, + SpotMarketOptions: instance.SpotMarketOptions, + Tenancy: instance.Tenancy, + }, + } + instanceLogger.V(4).Info("Creating AWSMachine") + if err := client.Create(ctx, awsMachine); err != nil { + return fmt.Errorf("failed to create AWSMachine: %w", err) + } + } + return nil +} + +func deleteOrphanedAWSMachines(ctx context.Context, awsMachineList *infrav1.AWSMachineList, existingASG *expinfrav1.AutoScalingGroup, l logr.Logger, client client.Client) error { + if !feature.Gates.Enabled(feature.MachinePoolMachines) { + return errors.New("deleteOrphanedAWSMachines must not be called unless the MachinePoolMachines feature gate is enabled") + } + + l.V(4).Info("Deleting orphaned AWSMachines") + providerIDToInstance := make(map[string]infrav1.Instance, len(existingASG.Instances)) + for i := range existingASG.Instances { + providerID := fmt.Sprintf("aws:///%s/%s", existingASG.Instances[i].AvailabilityZone, existingASG.Instances[i].ID) + providerIDToInstance[providerID] = existingASG.Instances[i] + } + + for i := range awsMachineList.Items { + awsMachine := awsMachineList.Items[i] + if awsMachine.Spec.ProviderID == nil || *awsMachine.Spec.ProviderID == "" { + continue + } + + providerID := *awsMachine.Spec.ProviderID + if _, exists := providerIDToInstance[providerID]; exists { + continue + } + + machine, err := util.GetOwnerMachine(ctx, client, awsMachine.ObjectMeta) + if err != nil { + return fmt.Errorf("failed to get owner Machine for %s/%s: %w", awsMachine.Namespace, awsMachine.Name, err) + } + machineLogger := l.WithValues("machine", klog.KObj(machine), "awsmachine", klog.KObj(&awsMachine), "ProviderID", providerID) + machineLogger.V(4).Info("Deleting orphaned Machine") + if machine == nil { + machineLogger.Info("No machine owner found for AWSMachine, deleting AWSMachine anyway.") + if err := client.Delete(ctx, &awsMachine); err != nil { + return fmt.Errorf("failed to delete orphan AWSMachine %s/%s: %w", awsMachine.Namespace, awsMachine.Name, err) + } + machineLogger.V(4).Info("Deleted AWSMachine") + continue + } + + if err := client.Delete(ctx, machine); err != nil { + return fmt.Errorf("failed to delete orphan Machine %s/%s: %w", machine.Namespace, machine.Name, err) + } + machineLogger.V(4).Info("Deleted Machine") + } + return nil +} + +func getAWSMachines(ctx context.Context, mp *expclusterv1.MachinePool, kubeClient client.Client) (*infrav1.AWSMachineList, error) { + if !feature.Gates.Enabled(feature.MachinePoolMachines) { + return nil, errors.New("getAWSMachines must not be called unless the MachinePoolMachines feature gate is enabled") + } + + awsMachineList := &infrav1.AWSMachineList{} + labels := map[string]string{ + clusterv1.MachinePoolNameLabel: format.MustFormatValue(mp.Name), + clusterv1.ClusterNameLabel: mp.Spec.ClusterName, + } + if err := kubeClient.List(ctx, awsMachineList, client.InNamespace(mp.Namespace), client.MatchingLabels(labels)); err != nil { + return nil, err + } + return awsMachineList, nil +} + +func reconcileDeleteAWSMachines(ctx context.Context, mp *expclusterv1.MachinePool, client client.Client, l logr.Logger) error { + if !feature.Gates.Enabled(feature.MachinePoolMachines) { + return errors.New("reconcileDeleteAWSMachines must not be called unless the MachinePoolMachines feature gate is enabled") + } + + awsMachineList, err := getAWSMachines(ctx, mp, client) + if err != nil { + return err + } + for i := range awsMachineList.Items { + awsMachine := awsMachineList.Items[i] + if awsMachine.DeletionTimestamp.IsZero() { + continue + } + logger := l.WithValues("awsmachine", klog.KObj(&awsMachine)) + // delete the owner Machine resource for the AWSMachine so that CAPI can clean up gracefully + machine, err := util.GetOwnerMachine(ctx, client, awsMachine.ObjectMeta) + if err != nil { + logger.V(2).Info("Failed to get owner Machine", "err", err.Error()) + continue + } + + if err := client.Delete(ctx, machine); err != nil { + logger.V(2).Info("Failed to delete owner Machine", "err", err.Error()) + } + } + return nil +} diff --git a/feature/feature.go b/feature/feature.go index 061e4edd57..110c75c5ba 100644 --- a/feature/feature.go +++ b/feature/feature.go @@ -54,6 +54,12 @@ const ( // alpha: v0.1 MachinePool featuregate.Feature = "MachinePool" + // MachinePoolMachines is a feature gate that enables creation of AWSMachine objects for AWSMachinePool and AWSManagedMachinePool. + // + // owner: @AndiDog + // alpha: v2.8 + MachinePoolMachines featuregate.Feature = "MachinePoolMachines" + // EventBridgeInstanceState will use Event Bridge and notifications to keep instance state up-to-date // owner: @gab-satchi // alpha: v0.7? @@ -102,6 +108,7 @@ var defaultCAPAFeatureGates = map[featuregate.Feature]featuregate.FeatureSpec{ EKSFargate: {Default: false, PreRelease: featuregate.Alpha}, EventBridgeInstanceState: {Default: false, PreRelease: featuregate.Alpha}, MachinePool: {Default: true, PreRelease: featuregate.Beta}, + MachinePoolMachines: {Default: false, PreRelease: featuregate.Alpha}, AutoControllerIdentityCreator: {Default: true, PreRelease: featuregate.Alpha}, BootstrapFormatIgnition: {Default: false, PreRelease: featuregate.Alpha}, ExternalResourceGC: {Default: false, PreRelease: featuregate.Alpha}, diff --git a/feature/gates.go b/feature/gates.go index b3576c313c..20597a7394 100644 --- a/feature/gates.go +++ b/feature/gates.go @@ -26,7 +26,7 @@ var ( // MutableGates is a mutable version of DefaultFeatureGate. // Only top-level commands/options setup and the k8s.io/component-base/featuregate/testing package should make use of this. // Tests that need to modify featuregate gates for the duration of their test should use: - // defer featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features., )() + // featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features., ) MutableGates featuregate.MutableFeatureGate = feature.MutableGates // Gates is a shared global FeatureGate. diff --git a/pkg/cloud/awserrors/errors.go b/pkg/cloud/awserrors/errors.go index d51b41595c..765d3ce626 100644 --- a/pkg/cloud/awserrors/errors.go +++ b/pkg/cloud/awserrors/errors.go @@ -56,6 +56,7 @@ const ( VPCNotFound = "InvalidVpcID.NotFound" VPCMissingParameter = "MissingParameter" ErrCodeRepositoryAlreadyExistsException = "RepositoryAlreadyExistsException" + ASGNotFound = "AutoScalingGroup.NotFound" ) var _ error = &EC2Error{} @@ -172,6 +173,8 @@ func IsInvalidNotFoundError(err error) bool { return true case LaunchTemplateNameNotFound: return true + case ASGNotFound: + return true } } diff --git a/pkg/cloud/scope/machine.go b/pkg/cloud/scope/machine.go index 3f484d0fc4..243bd40242 100644 --- a/pkg/cloud/scope/machine.go +++ b/pkg/cloud/scope/machine.go @@ -29,6 +29,7 @@ import ( infrav1 "sigs.k8s.io/cluster-api-provider-aws/v2/api/v1beta2" ekscontrolplanev1 "sigs.k8s.io/cluster-api-provider-aws/v2/controlplane/eks/api/v1beta2" + "sigs.k8s.io/cluster-api-provider-aws/v2/exp/api/v1beta2" "sigs.k8s.io/cluster-api-provider-aws/v2/pkg/logger" clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" "sigs.k8s.io/cluster-api/util" @@ -113,6 +114,19 @@ func (m *MachineScope) IsControlPlane() bool { return util.IsControlPlaneMachine(m.Machine) } +// IsMachinePoolMachine returns true if the machine is created for a machinepool. +func (m *MachineScope) IsMachinePoolMachine() bool { + if _, ok := m.Machine.GetLabels()[clusterv1.MachinePoolNameLabel]; ok { + return true + } + for _, owner := range m.Machine.OwnerReferences { + if owner.Kind == v1beta2.KindMachinePool { + return true + } + } + return false +} + // Role returns the machine role from the labels. func (m *MachineScope) Role() string { if util.IsControlPlaneMachine(m.Machine) { diff --git a/pkg/cloud/services/ec2/instances.go b/pkg/cloud/services/ec2/instances.go index 7cde69b03e..4d1f534795 100644 --- a/pkg/cloud/services/ec2/instances.go +++ b/pkg/cloud/services/ec2/instances.go @@ -1083,7 +1083,7 @@ func (s *Service) GetDHCPOptionSetDomainName(ec2client ec2iface.EC2API, vpcID *s log := s.scope.GetLogger() if vpcID == nil { - log.Info("vpcID is nil, skipping DHCP Option Set discovery") + log.V(4).Info("vpcID is nil, skipping DHCP Option Set discovery") return nil }