diff --git a/config/rbac/role.yaml b/config/rbac/role.yaml index 0338fde577..401fd99abc 100644 --- a/config/rbac/role.yaml +++ b/config/rbac/role.yaml @@ -175,6 +175,7 @@ rules: resources: - awsclusters/status - awsfargateprofiles/status + - awsmachinetemplates/status - rosaclusters/status - rosanetworks/status - rosaroleconfigs/status diff --git a/controllers/awsmachinetemplate_controller.go b/controllers/awsmachinetemplate_controller.go new file mode 100644 index 0000000000..0d81030772 --- /dev/null +++ b/controllers/awsmachinetemplate_controller.go @@ -0,0 +1,193 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package controllers + +import ( + "context" + + "github.com/aws/aws-sdk-go-v2/service/ec2" + ec2types "github.com/aws/aws-sdk-go-v2/service/ec2/types" + "github.com/pkg/errors" + corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/api/resource" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/controller" + + infrav1 "sigs.k8s.io/cluster-api-provider-aws/v2/api/v1beta2" + "sigs.k8s.io/cluster-api-provider-aws/v2/pkg/cloud/scope" + "sigs.k8s.io/cluster-api-provider-aws/v2/pkg/logger" + "sigs.k8s.io/cluster-api-provider-aws/v2/pkg/record" + "sigs.k8s.io/cluster-api/util" + "sigs.k8s.io/cluster-api/util/predicates" +) + +// AWSMachineTemplateReconciler reconciles AWSMachineTemplate objects. 
+//
+// This controller automatically populates capacity information for AWSMachineTemplate resources
+// to enable autoscaling from zero. For each template it resolves the AWS region from the owning
+// Cluster's AWSCluster, asks EC2 (DescribeInstanceTypes) for the instance type's vCPU count and
+// memory size, and records them in status.capacity.
+//
+// See: https://github.com/kubernetes-sigs/cluster-api/blob/main/docs/proposals/20210310-opt-in-autoscaling-from-zero.md
+type AWSMachineTemplateReconciler struct {
+	client.Client
+	// WatchFilterValue is the value passed to the predicates.ResourceHasFilterLabel event filter
+	// in SetupWithManager.
+	WatchFilterValue string
+}
+
+// +kubebuilder:rbac:groups=infrastructure.cluster.x-k8s.io,resources=awsmachinetemplates,verbs=get;list;watch
+// +kubebuilder:rbac:groups=infrastructure.cluster.x-k8s.io,resources=awsmachinetemplates/status,verbs=get;update;patch
+// +kubebuilder:rbac:groups=infrastructure.cluster.x-k8s.io,resources=awsclusters,verbs=get;list;watch
+// +kubebuilder:rbac:groups=cluster.x-k8s.io,resources=clusters,verbs=get;list;watch
+// +kubebuilder:rbac:groups="",resources=events,verbs=get;list;watch;create;update;patch
+
+// Reconcile populates status.capacity for the AWSMachineTemplate named in req.
+//
+// It returns without error and without requeueing when the template is gone, is being deleted,
+// already has capacity, has no instance type, or when getRegion yields no region. If the AWS
+// session cannot be created or the EC2 query fails, the failure is reported through record.Warnf
+// on the template and nil is returned, so it is not retried by this request (best effort).
+// Errors from reading the template, resolving the region, or writing the status are returned.
+func (r *AWSMachineTemplateReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
+	log := logger.FromContext(ctx)
+
+	// Fetch the AWSMachineTemplate; a missing object means there is nothing left to do.
+	awsMachineTemplate := &infrav1.AWSMachineTemplate{}
+	if err := r.Get(ctx, req.NamespacedName, awsMachineTemplate); err != nil {
+		return ctrl.Result{}, client.IgnoreNotFound(err)
+	}
+
+	// A template that is being deleted does not need capacity: skip the AWS call and the status write.
+	if !awsMachineTemplate.DeletionTimestamp.IsZero() {
+		return ctrl.Result{}, nil
+	}
+
+	// Skip if capacity is already set
+	if len(awsMachineTemplate.Status.Capacity) > 0 {
+		return ctrl.Result{}, nil
+	}
+
+	// Without an instance type there is nothing to look up.
+	instanceType := awsMachineTemplate.Spec.Template.Spec.InstanceType
+	if instanceType == "" {
+		return ctrl.Result{}, nil
+	}
+
+	// Find the region by checking ownerReferences; an empty region means it cannot be determined here.
+	region, err := r.getRegion(ctx, awsMachineTemplate)
+	if err != nil {
+		return ctrl.Result{}, err
+	}
+	if region == "" {
+		return ctrl.Result{}, nil
+	}
+
+	// Create global scope for this region
+	// Reference: exp/instancestate/awsinstancestate_controller.go
+	globalScope, err := scope.NewGlobalScope(scope.GlobalScopeParams{
+		ControllerName: "awsmachinetemplate",
+		Region:         region,
+	})
+	if err != nil {
+		record.Warnf(awsMachineTemplate, "AWSSessionFailed", "Failed to create AWS session for region %q: %v", region, err)
+		return ctrl.Result{}, nil
+	}
+
+	// Query instance type capacity
+	capacity, err := r.getInstanceTypeCapacity(ctx, globalScope, instanceType)
+	if err != nil {
+		record.Warnf(awsMachineTemplate, "CapacityQueryFailed", "Failed to query capacity for instance type %q: %v", instanceType, err)
+		return ctrl.Result{}, nil
+	}
+
+	// Snapshot the object before mutating it so the patch carries only the status delta.
+	// Unlike Update, a merge patch built with client.MergeFrom does not send the resourceVersion,
+	// so a concurrent change to the template does not turn this write into a conflict error.
+	patchBase := client.MergeFrom(awsMachineTemplate.DeepCopy())
+	awsMachineTemplate.Status.Capacity = capacity
+
+	if err := r.Status().Patch(ctx, awsMachineTemplate, patchBase); err != nil {
+		return ctrl.Result{}, errors.Wrap(err, "failed to patch AWSMachineTemplate status")
+	}
+
+	log.Info("Successfully populated capacity information", "instanceType", instanceType, "region", region, "capacity", capacity)
+	return ctrl.Result{}, nil
+}
+
+// getRegion finds the region by checking the template's owner cluster reference.
+func (r *AWSMachineTemplateReconciler) getRegion(ctx context.Context, template *infrav1.AWSMachineTemplate) (string, error) {
+	// Resolve the Cluster named by the template's owner references.
+	ownerCluster, err := util.GetOwnerCluster(ctx, r.Client, template.ObjectMeta)
+	switch {
+	case err != nil:
+		return "", err
+	case ownerCluster == nil:
+		return "", errors.New("no owner cluster found")
+	}
+
+	// Only an AWSCluster infrastructure reference (standard EC2-based cluster) is consulted;
+	// a missing reference or any other kind yields an empty region and no error.
+	infraRef := ownerCluster.Spec.InfrastructureRef
+	if infraRef == nil || infraRef.Kind != "AWSCluster" {
+		return "", nil
+	}
+
+	// The AWSCluster is looked up in the Cluster's namespace under the referenced name.
+	key := client.ObjectKey{Namespace: ownerCluster.Namespace, Name: infraRef.Name}
+	awsCluster := &infrav1.AWSCluster{}
+	if getErr := r.Get(ctx, key, awsCluster); getErr != nil {
+		if apierrors.IsNotFound(getErr) {
+			// A missing AWSCluster is reported as "no region", not as an error.
+			return "", nil
+		}
+		return "", errors.Wrapf(getErr, "failed to get AWSCluster %s/%s", key.Namespace, key.Name)
+	}
+
+	// Spec.Region may itself be empty, in which case the caller also sees "no region".
+	return awsCluster.Spec.Region, nil
+}
+
+// getInstanceTypeCapacity queries AWS EC2 API for instance type capacity.
+func (r *AWSMachineTemplateReconciler) getInstanceTypeCapacity(ctx context.Context, globalScope *scope.GlobalScope, instanceType string) (corev1.ResourceList, error) {
+	// The returned list holds cpu (from DefaultVCpus) and memory (SizeInMiB converted to bytes);
+	// either entry is omitted when EC2 does not report it. An error is returned when the
+	// DescribeInstanceTypes call fails or yields no entry for instanceType.
+	const bytesPerMiB = 1024 * 1024
+
+	// Create EC2 client from global scope
+	ec2Client := ec2.NewFromConfig(globalScope.Session())
+
+	// Query instance type information
+	input := &ec2.DescribeInstanceTypesInput{
+		InstanceTypes: []ec2types.InstanceType{ec2types.InstanceType(instanceType)},
+	}
+
+	result, err := ec2Client.DescribeInstanceTypes(ctx, input)
+	if err != nil {
+		return nil, errors.Wrapf(err, "failed to describe instance type %q", instanceType)
+	}
+
+	if len(result.InstanceTypes) == 0 {
+		return nil, errors.Errorf("no information found for instance type %q", instanceType)
+	}
+
+	// Extract capacity information; exactly one type was requested, so only the first entry is read.
+	info := result.InstanceTypes[0]
+	resourceList := corev1.ResourceList{}
+
+	// CPU
+	if info.VCpuInfo != nil && info.VCpuInfo.DefaultVCpus != nil {
+		resourceList[corev1.ResourceCPU] = *resource.NewQuantity(int64(*info.VCpuInfo.DefaultVCpus), resource.DecimalSI)
+	}
+
+	// Memory
+	if info.MemoryInfo != nil && info.MemoryInfo.SizeInMiB != nil {
+		memoryBytes := *info.MemoryInfo.SizeInMiB * bytesPerMiB
+		resourceList[corev1.ResourceMemory] = *resource.NewQuantity(memoryBytes, resource.BinarySI)
+	}
+	return resourceList, nil
+}
+
+// SetupWithManager registers the reconciler with mgr as the controller for AWSMachineTemplate
+// objects, applying options and an event filter built from predicates.ResourceHasFilterLabel
+// with r.WatchFilterValue. It returns the error reported by the controller builder, if any.
+func (r *AWSMachineTemplateReconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager, options controller.Options) error {
+	log := logger.FromContext(ctx)
+
+	return ctrl.NewControllerManagedBy(mgr).
+		For(&infrav1.AWSMachineTemplate{}).
+		WithOptions(options).
+		WithEventFilter(predicates.ResourceHasFilterLabel(mgr.GetScheme(), log.GetLogger(), r.WatchFilterValue)).
+		Complete(r)
+}
diff --git a/controllers/awsmachinetemplate_controller_unit_test.go b/controllers/awsmachinetemplate_controller_unit_test.go
new file mode 100644
index 0000000000..a1a045c099
--- /dev/null
+++ b/controllers/awsmachinetemplate_controller_unit_test.go
@@ -0,0 +1,295 @@
+/*
+Copyright 2025 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package controllers
+
+import (
+	"context"
+	"testing"
+
+	. "github.com/onsi/gomega"
+	corev1 "k8s.io/api/core/v1"
+	"k8s.io/apimachinery/pkg/api/resource"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/runtime"
+	ctrl "sigs.k8s.io/controller-runtime"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+	"sigs.k8s.io/controller-runtime/pkg/client/fake"
+
+	infrav1 "sigs.k8s.io/cluster-api-provider-aws/v2/api/v1beta2"
+	clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
+)
+
+func TestAWSMachineTemplateReconciler(t *testing.T) {
+	setupScheme := func() *runtime.Scheme {
+		scheme := runtime.NewScheme()
+		_ = infrav1.AddToScheme(scheme)
+		_ = clusterv1.AddToScheme(scheme)
+		_ = corev1.AddToScheme(scheme)
+		return scheme
+	}
+
+	newFakeClient := func(objs ...client.Object) client.Client {
+		return fake.NewClientBuilder().
+			WithScheme(setupScheme()).
+			WithObjects(objs...).
+			WithStatusSubresource(&infrav1.AWSMachineTemplate{}).
+			Build()
+	}
+
+	newAWSMachineTemplate := func(name string) *infrav1.AWSMachineTemplate {
+		return &infrav1.AWSMachineTemplate{
+			ObjectMeta: metav1.ObjectMeta{
+				Name:      name,
+				Namespace: "default",
+			},
+			Spec: infrav1.AWSMachineTemplateSpec{
+				Template: infrav1.AWSMachineTemplateResource{
+					Spec: infrav1.AWSMachineSpec{
+						InstanceType: "t3.medium",
+					},
+				},
+			},
+		}
+	}
+
+	t.Run("getRegion", func(t *testing.T) {
+		t.Run("should get region from AWSCluster", func(t *testing.T) {
+			g := NewWithT(t)
+			template := newAWSMachineTemplate("test-template")
+			template.OwnerReferences = []metav1.OwnerReference{
+				{
+					APIVersion: clusterv1.GroupVersion.String(),
+					Kind:       "Cluster",
+					Name:       "test-cluster",
+				},
+			}
+			cluster := &clusterv1.Cluster{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      "test-cluster",
+					Namespace: "default",
+				},
+				Spec: clusterv1.ClusterSpec{
+					InfrastructureRef: &corev1.ObjectReference{
+						Kind:      "AWSCluster",
+						Name:      "test-aws-cluster",
+						Namespace: "default",
+					},
+				},
+			}
+			awsCluster := &infrav1.AWSCluster{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      "test-aws-cluster",
+					Namespace: "default",
+				},
+				Spec: infrav1.AWSClusterSpec{
+					Region: "us-west-2",
+				},
+			}
+
+			reconciler := &AWSMachineTemplateReconciler{
+				Client: newFakeClient(template, cluster, awsCluster),
+			}
+
+			region, err := reconciler.getRegion(context.Background(), template)
+
+			g.Expect(err).NotTo(HaveOccurred())
+			g.Expect(region).To(Equal("us-west-2"))
+		})
+
+		t.Run("should return error when no owner cluster found", func(t *testing.T) {
+			g := NewWithT(t)
+			template := newAWSMachineTemplate("test-template")
+
+			reconciler := &AWSMachineTemplateReconciler{
+				Client: newFakeClient(template),
+			}
+
+			region, err := reconciler.getRegion(context.Background(), template)
+
+			g.Expect(err).To(HaveOccurred())
+			g.Expect(err).To(MatchError(ContainSubstring("no owner cluster found")))
+			g.Expect(region).To(BeEmpty())
+		})
+
+		t.Run("should return empty when cluster has no infrastructure ref", func(t *testing.T) {
+			g := NewWithT(t)
+			template := newAWSMachineTemplate("test-template")
+			template.OwnerReferences = []metav1.OwnerReference{
+				{
+					APIVersion: clusterv1.GroupVersion.String(),
+					Kind:       "Cluster",
+					Name:       "test-cluster",
+				},
+			}
+			cluster := &clusterv1.Cluster{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      "test-cluster",
+					Namespace: "default",
+				},
+			}
+
+			reconciler := &AWSMachineTemplateReconciler{
+				Client: newFakeClient(template, cluster),
+			}
+
+			region, err := reconciler.getRegion(context.Background(), template)
+
+			g.Expect(err).NotTo(HaveOccurred())
+			g.Expect(region).To(BeEmpty())
+		})
+
+		t.Run("should return empty when AWSCluster not found", func(t *testing.T) {
+			g := NewWithT(t)
+			template := newAWSMachineTemplate("test-template")
+			template.OwnerReferences = []metav1.OwnerReference{
+				{
+					APIVersion: clusterv1.GroupVersion.String(),
+					Kind:       "Cluster",
+					Name:       "test-cluster",
+				},
+			}
+			cluster := &clusterv1.Cluster{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      "test-cluster",
+					Namespace: "default",
+				},
+				Spec: clusterv1.ClusterSpec{
+					InfrastructureRef: &corev1.ObjectReference{
+						Kind:      "AWSCluster",
+						Name:      "test-aws-cluster",
+						Namespace: "default",
+					},
+				},
+			}
+
+			reconciler := &AWSMachineTemplateReconciler{
+				Client: newFakeClient(template, cluster),
+			}
+
+			region, err := reconciler.getRegion(context.Background(), template)
+
+			g.Expect(err).NotTo(HaveOccurred())
+			g.Expect(region).To(BeEmpty())
+		})
+	})
+
+	// Note: getInstanceTypeCapacity tests are skipped as they require EC2 client injection
+	// which would need significant refactoring. The function is tested indirectly through
+	// integration tests.
+
+	t.Run("Reconcile", func(t *testing.T) {
+		t.Run("should skip when capacity already set", func(t *testing.T) {
+			g := NewWithT(t)
+			template := newAWSMachineTemplate("test-template")
+			template.Status.Capacity = corev1.ResourceList{
+				corev1.ResourceCPU: *resource.NewQuantity(2, resource.DecimalSI),
+			}
+
+			reconciler := &AWSMachineTemplateReconciler{
+				Client: newFakeClient(template),
+			}
+
+			result, err := reconciler.Reconcile(context.Background(), ctrl.Request{
+				NamespacedName: client.ObjectKeyFromObject(template),
+			})
+
+			g.Expect(err).NotTo(HaveOccurred())
+			g.Expect(result).To(Equal(ctrl.Result{}))
+		})
+
+		t.Run("should skip when instance type is empty", func(t *testing.T) {
+			g := NewWithT(t)
+			template := newAWSMachineTemplate("test-template")
+			template.Spec.Template.Spec.InstanceType = ""
+
+			reconciler := &AWSMachineTemplateReconciler{
+				Client: newFakeClient(template),
+			}
+
+			result, err := reconciler.Reconcile(context.Background(), ctrl.Request{
+				NamespacedName: client.ObjectKeyFromObject(template),
+			})
+
+			g.Expect(err).NotTo(HaveOccurred())
+			g.Expect(result).To(Equal(ctrl.Result{}))
+		})
+
+		t.Run("should return error when no owner cluster", func(t *testing.T) {
+			g := NewWithT(t)
+			template := newAWSMachineTemplate("test-template")
+
+			reconciler := &AWSMachineTemplateReconciler{
+				Client: newFakeClient(template),
+			}
+
+			result, err := reconciler.Reconcile(context.Background(), ctrl.Request{
+				NamespacedName: client.ObjectKeyFromObject(template),
+			})
+
+			g.Expect(err).To(HaveOccurred())
+			g.Expect(err).To(MatchError(ContainSubstring("no owner cluster found")))
+			g.Expect(result).To(Equal(ctrl.Result{}))
+		})
+
+		t.Run("should skip when region is empty", func(t *testing.T) {
+			g := NewWithT(t)
+			template := newAWSMachineTemplate("test-template")
+			template.OwnerReferences = []metav1.OwnerReference{
+				{
+					APIVersion: clusterv1.GroupVersion.String(),
+					Kind:       "Cluster",
+					Name:       "test-cluster",
+				},
+			}
+			cluster := &clusterv1.Cluster{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      "test-cluster",
+					Namespace: "default",
+				},
+			}
+
+			reconciler := &AWSMachineTemplateReconciler{
+				Client: newFakeClient(template, cluster),
+			}
+
+			result, err := reconciler.Reconcile(context.Background(), ctrl.Request{
+				NamespacedName: client.ObjectKeyFromObject(template),
+			})
+
+			g.Expect(err).NotTo(HaveOccurred())
+			g.Expect(result).To(Equal(ctrl.Result{}))
+		})
+
+		t.Run("should return nil when template not found", func(t *testing.T) {
+			g := NewWithT(t)
+
+			reconciler := &AWSMachineTemplateReconciler{
+				Client: newFakeClient(),
+			}
+
+			result, err := reconciler.Reconcile(context.Background(), ctrl.Request{
+				NamespacedName: client.ObjectKey{
+					Namespace: "default",
+					Name:      "nonexistent",
+				},
+			})
+
+			g.Expect(err).NotTo(HaveOccurred())
+			g.Expect(result).To(Equal(ctrl.Result{}))
+		})
+	})
+}
diff --git a/main.go b/main.go
index a0fcb0563b..3931eefa34 100644
--- a/main.go
+++ b/main.go
@@ -402,6 +402,15 @@ func setupReconcilersAndWebhooks(ctx context.Context, mgr ctrl.Manager,
 		}
 	}
 
+	setupLog.Debug("enabling AWSMachineTemplate controller")
+	if err := (&controllers.AWSMachineTemplateReconciler{
+		Client:           mgr.GetClient(),
+		WatchFilterValue: watchFilterValue,
+	}).SetupWithManager(ctx, mgr, controller.Options{MaxConcurrentReconciles: awsClusterConcurrency, RecoverPanic: ptr.To[bool](true)}); err != nil {
+		setupLog.Error(err, "unable to create controller", "controller", "AWSMachineTemplate")
+		os.Exit(1)
+	}
+
 	if err := (&infrav1.AWSMachineTemplate{}).SetupWebhookWithManager(mgr); err != nil {
 		setupLog.Error(err, "unable to create webhook", "webhook", "AWSMachineTemplate")
 		os.Exit(1)
diff --git a/test/e2e/suites/unmanaged/unmanaged_functional_test.go b/test/e2e/suites/unmanaged/unmanaged_functional_test.go
index f4d6d42e94..723ae16c49 100644
--- a/test/e2e/suites/unmanaged/unmanaged_functional_test.go
+++ b/test/e2e/suites/unmanaged/unmanaged_functional_test.go
@@ -352,6 +352,23 @@ var _ = ginkgo.Context("[unmanaged] [functional]", func() {
 			})
 			Expect(len(workerMachines)).To(Equal(1))
Expect(len(controlPlaneMachines)).To(Equal(1)) + + ginkgo.By("Verifying AWSMachineTemplate capacity is populated for autoscaling from zero") + awsMachineTemplateList := &infrav1.AWSMachineTemplateList{} + err := e2eCtx.Environment.BootstrapClusterProxy.GetClient().List(ctx, awsMachineTemplateList, client.InNamespace(namespace.Name)) + Expect(err).To(BeNil()) + Expect(len(awsMachineTemplateList.Items)).To(BeNumerically(">", 0), "Expected at least one AWSMachineTemplate") + + foundTemplateWithCapacity := false + for _, template := range awsMachineTemplateList.Items { + if len(template.Status.Capacity) > 0 { + foundTemplateWithCapacity = true + ginkgo.By(fmt.Sprintf("AWSMachineTemplate %s has capacity populated: %v", template.Name, template.Status.Capacity)) + Expect(template.Status.Capacity).To(HaveKey(corev1.ResourceCPU), "Expected CPU to be set in capacity") + Expect(template.Status.Capacity).To(HaveKey(corev1.ResourceMemory), "Expected Memory to be set in capacity") + } + } + Expect(foundTemplateWithCapacity).To(BeTrue(), "Expected at least one AWSMachineTemplate to have capacity populated") }) })