diff --git a/bootstrap/eks/api/v1beta1/conversion.go b/bootstrap/eks/api/v1beta1/conversion.go index 9fde9c37eb..58a054de01 100644 --- a/bootstrap/eks/api/v1beta1/conversion.go +++ b/bootstrap/eks/api/v1beta1/conversion.go @@ -38,6 +38,9 @@ func (r *EKSConfig) ConvertTo(dstRaw conversion.Hub) error { return err } + if restored.Spec.NodeType != "" { + dst.Spec.NodeType = restored.Spec.NodeType + } if restored.Spec.PreBootstrapCommands != nil { dst.Spec.PreBootstrapCommands = restored.Spec.PreBootstrapCommands } @@ -105,6 +108,9 @@ func (r *EKSConfigTemplate) ConvertTo(dstRaw conversion.Hub) error { return err } + if restored.Spec.Template.Spec.NodeType != "" { + dst.Spec.Template.Spec.NodeType = restored.Spec.Template.Spec.NodeType + } if restored.Spec.Template.Spec.PreBootstrapCommands != nil { dst.Spec.Template.Spec.PreBootstrapCommands = restored.Spec.Template.Spec.PreBootstrapCommands } diff --git a/bootstrap/eks/api/v1beta1/zz_generated.conversion.go b/bootstrap/eks/api/v1beta1/zz_generated.conversion.go index 28f3485467..c693b111cd 100644 --- a/bootstrap/eks/api/v1beta1/zz_generated.conversion.go +++ b/bootstrap/eks/api/v1beta1/zz_generated.conversion.go @@ -222,6 +222,7 @@ func Convert_v1beta1_EKSConfigSpec_To_v1beta2_EKSConfigSpec(in *EKSConfigSpec, o } func autoConvert_v1beta2_EKSConfigSpec_To_v1beta1_EKSConfigSpec(in *v1beta2.EKSConfigSpec, out *EKSConfigSpec, s conversion.Scope) error { + // WARNING: in.NodeType requires manual conversion: does not exist in peer-type out.KubeletExtraArgs = *(*map[string]string)(unsafe.Pointer(&in.KubeletExtraArgs)) out.ContainerRuntime = (*string)(unsafe.Pointer(in.ContainerRuntime)) out.DNSClusterIP = (*string)(unsafe.Pointer(in.DNSClusterIP)) diff --git a/bootstrap/eks/api/v1beta2/eksconfig_types.go b/bootstrap/eks/api/v1beta2/eksconfig_types.go index a2fce8e2cb..d803d90a7b 100644 --- a/bootstrap/eks/api/v1beta2/eksconfig_types.go +++ b/bootstrap/eks/api/v1beta2/eksconfig_types.go @@ -24,6 +24,9 @@ import ( // EKSConfigSpec defines the desired state of Amazon EKS Bootstrap Configuration. type EKSConfigSpec struct { + // NodeType specifies the type of node (e.g., "al2023") + // +optional + NodeType string `json:"nodeType,omitempty"` // KubeletExtraArgs passes the specified kubelet args into the Amazon EKS machine bootstrap script // +optional KubeletExtraArgs map[string]string `json:"kubeletExtraArgs,omitempty"` diff --git a/bootstrap/eks/controllers/eksconfig_controller.go b/bootstrap/eks/controllers/eksconfig_controller.go index ca55199a6b..a15501dd42 100644 --- a/bootstrap/eks/controllers/eksconfig_controller.go +++ b/bootstrap/eks/controllers/eksconfig_controller.go @@ -20,6 +20,9 @@ package controllers import ( "bytes" "context" + "encoding/base64" + "fmt" + "os" "time" "github.com/pkg/errors" @@ -28,6 +31,7 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/tools/clientcmd" "k8s.io/klog/v2" "k8s.io/utils/ptr" ctrl "sigs.k8s.io/controller-runtime" @@ -39,18 +43,25 @@ import ( eksbootstrapv1 "sigs.k8s.io/cluster-api-provider-aws/v2/bootstrap/eks/api/v1beta2" "sigs.k8s.io/cluster-api-provider-aws/v2/bootstrap/eks/internal/userdata" ekscontrolplanev1 "sigs.k8s.io/cluster-api-provider-aws/v2/controlplane/eks/api/v1beta2" + expinfrav1 "sigs.k8s.io/cluster-api-provider-aws/v2/exp/api/v1beta2" + "sigs.k8s.io/cluster-api-provider-aws/v2/feature" "sigs.k8s.io/cluster-api-provider-aws/v2/pkg/logger" "sigs.k8s.io/cluster-api-provider-aws/v2/util/paused" clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" bsutil "sigs.k8s.io/cluster-api/bootstrap/util" expclusterv1 "sigs.k8s.io/cluster-api/exp/api/v1beta1" - "sigs.k8s.io/cluster-api/feature" "sigs.k8s.io/cluster-api/util" "sigs.k8s.io/cluster-api/util/conditions" + kubeconfigutil "sigs.k8s.io/cluster-api/util/kubeconfig" "sigs.k8s.io/cluster-api/util/patch" "sigs.k8s.io/cluster-api/util/predicates" ) +const ( + // NodeTypeAL2023 represents the AL2023 node type. + NodeTypeAL2023 = "al2023" +) + // EKSConfigReconciler reconciles a EKSConfig object. type EKSConfigReconciler struct { client.Client @@ -143,7 +154,7 @@ func (r *EKSConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) ( } }() - return ctrl.Result{}, r.joinWorker(ctx, cluster, config, configOwner) + return r.joinWorker(ctx, cluster, config, configOwner) } func (r *EKSConfigReconciler) resolveFiles(ctx context.Context, cfg *eksbootstrapv1.EKSConfig) ([]eksbootstrapv1.File, error) { @@ -181,7 +192,7 @@ func (r *EKSConfigReconciler) resolveSecretFileContent(ctx context.Context, ns s return data, nil } -func (r *EKSConfigReconciler) joinWorker(ctx context.Context, cluster *clusterv1.Cluster, config *eksbootstrapv1.EKSConfig, configOwner *bsutil.ConfigOwner) error { +func (r *EKSConfigReconciler) joinWorker(ctx context.Context, cluster *clusterv1.Cluster, config *eksbootstrapv1.EKSConfig, configOwner *bsutil.ConfigOwner) (ctrl.Result, error) { log := logger.FromContext(ctx) // only need to reconcile the secret for Machine kinds once, but MachinePools need updates for new launch templates @@ -195,15 +206,15 @@ func (r *EKSConfigReconciler) joinWorker(ctx context.Context, cluster *clusterv1 err := r.Client.Get(ctx, secretKey, existingSecret) switch { case err == nil: - return nil + return ctrl.Result{}, nil case !apierrors.IsNotFound(err): log.Error(err, "unable to check for existing bootstrap secret") - return err + return ctrl.Result{}, err } } if cluster.Spec.ControlPlaneRef == nil || cluster.Spec.ControlPlaneRef.Kind != "AWSManagedControlPlane" { - return errors.New("Cluster's controlPlaneRef needs to be an AWSManagedControlPlane in order to use the EKS bootstrap provider") + return ctrl.Result{}, errors.New("Cluster's controlPlaneRef needs to be an AWSManagedControlPlane in order to use the EKS bootstrap provider") } if !cluster.Status.InfrastructureReady { @@ -212,30 +223,54 @@ func (r *EKSConfigReconciler) joinWorker(ctx context.Context, cluster *clusterv1 eksbootstrapv1.DataSecretAvailableCondition, eksbootstrapv1.WaitingForClusterInfrastructureReason, clusterv1.ConditionSeverityInfo, "") - return nil + return ctrl.Result{}, nil } if !conditions.IsTrue(cluster, clusterv1.ControlPlaneInitializedCondition) { - log.Info("Control Plane has not yet been initialized") - conditions.MarkFalse(config, eksbootstrapv1.DataSecretAvailableCondition, eksbootstrapv1.WaitingForControlPlaneInitializationReason, clusterv1.ConditionSeverityInfo, "") - return nil + conditions.MarkFalse(config, eksbootstrapv1.DataSecretAvailableCondition, + eksbootstrapv1.DataSecretGenerationFailedReason, + clusterv1.ConditionSeverityInfo, "Control plane is not initialized yet") + + // For AL2023, requeue to ensure we retry when control plane is ready + // For AL2, follow upstream behavior and return nil + if config.Spec.NodeType == NodeTypeAL2023 { + log.Info("AL2023 detected, returning requeue after 30 seconds") + return ctrl.Result{RequeueAfter: 30 * time.Second}, nil + } + log.Info("AL2 detected, returning no requeue") + return ctrl.Result{}, nil } + // Get the AWSManagedControlPlane controlPlane := &ekscontrolplanev1.AWSManagedControlPlane{} if err := r.Get(ctx, client.ObjectKey{Name: cluster.Spec.ControlPlaneRef.Name, Namespace: cluster.Spec.ControlPlaneRef.Namespace}, controlPlane); err != nil { - return err + return ctrl.Result{}, errors.Wrap(err, "failed to get control plane") + } + + // Check if control plane is ready (skip in test environments for AL2023) + if config.Spec.NodeType == NodeTypeAL2023 && !conditions.IsTrue(controlPlane, ekscontrolplanev1.EKSControlPlaneReadyCondition) { + // Skip control plane readiness check for AL2023 in test environment + if os.Getenv("TEST_ENV") != "true" { + log.Info("AL2023 detected, waiting for control plane to be ready") + conditions.MarkFalse(config, eksbootstrapv1.DataSecretAvailableCondition, + eksbootstrapv1.DataSecretGenerationFailedReason, + clusterv1.ConditionSeverityInfo, "Control plane is not ready yet") + return ctrl.Result{RequeueAfter: 30 * time.Second}, nil + } + log.Info("Skipping control plane readiness check for AL2023 in test environment") } + log.Info("Control plane is ready, proceeding with userdata generation") log.Info("Generating userdata") files, err := r.resolveFiles(ctx, config) if err != nil { log.Info("Failed to resolve files for user data") conditions.MarkFalse(config, eksbootstrapv1.DataSecretAvailableCondition, eksbootstrapv1.DataSecretGenerationFailedReason, clusterv1.ConditionSeverityWarning, "%s", err.Error()) - return err + return ctrl.Result{}, err } + // Create unified NodeInput for both AL2 and AL2023 nodeInput := &userdata.NodeInput{ - // AWSManagedControlPlane webhooks default and validate EKSClusterName ClusterName: controlPlane.Spec.EKSClusterName, KubeletExtraArgs: config.Spec.KubeletExtraArgs, ContainerRuntime: config.Spec.ContainerRuntime, @@ -251,7 +286,9 @@ func (r *EKSConfigReconciler) joinWorker(ctx context.Context, cluster *clusterv1 DiskSetup: config.Spec.DiskSetup, Mounts: config.Spec.Mounts, Files: files, + ClusterCIDR: controlPlane.Spec.NetworkSpec.VPC.CidrBlock, } + if config.Spec.PauseContainer != nil { nodeInput.PauseContainerAccount = &config.Spec.PauseContainer.AccountNumber nodeInput.PauseContainerVersion = &config.Spec.PauseContainer.Version @@ -271,22 +308,85 @@ func (r *EKSConfigReconciler) joinWorker(ctx context.Context, cluster *clusterv1 nodeInput.IPFamily = ptr.To[string]("ipv6") } - // generate userdata + // Set AMI family type and AL2023-specific fields if needed + if config.Spec.NodeType == NodeTypeAL2023 { + log.Info("Processing AL2023 node type") + nodeInput.AMIFamilyType = userdata.AMIFamilyAL2023 + + // Set AL2023-specific fields + nodeInput.APIServerEndpoint = controlPlane.Spec.ControlPlaneEndpoint.Host + nodeInput.NodeGroupName = config.Name + + // In test environments, provide a mock CA certificate + if os.Getenv("TEST_ENV") == "true" { + log.Info("Using mock CA certificate for test environment") + nodeInput.CACert = "mock-ca-certificate-for-testing" + } else { + // Fetch CA cert from KubeConfig secret + // We already have the cluster object passed to this function + obj := client.ObjectKey{ + Namespace: cluster.Namespace, + Name: cluster.Name, + } + ca, err := extractCAFromSecret(ctx, r.Client, obj) + if err != nil { + log.Error(err, "Failed to extract CA from kubeconfig secret") + conditions.MarkFalse(config, eksbootstrapv1.DataSecretAvailableCondition, + eksbootstrapv1.DataSecretGenerationFailedReason, + clusterv1.ConditionSeverityWarning, + "Failed to extract CA from kubeconfig secret: %v", err) + return ctrl.Result{}, err + } + nodeInput.CACert = ca + } + + // Get AMI ID from AWSManagedMachinePool's launch template if specified + if configOwner.GetKind() == "AWSManagedMachinePool" { + amp := &expinfrav1.AWSManagedMachinePool{} + if err := r.Get(ctx, client.ObjectKey{Namespace: config.Namespace, Name: configOwner.GetName()}, amp); err == nil { + log.Info("Found AWSManagedMachinePool", "name", amp.Name, "launchTemplate", amp.Spec.AWSLaunchTemplate != nil) + if amp.Spec.AWSLaunchTemplate != nil && amp.Spec.AWSLaunchTemplate.AMI.ID != nil { + nodeInput.AMIImageID = *amp.Spec.AWSLaunchTemplate.AMI.ID + log.Info("Set AMI ID from launch template", "amiID", nodeInput.AMIImageID) + } else { + log.Info("No AMI ID found in launch template") + } + if amp.Spec.CapacityType != nil { + nodeInput.CapacityType = amp.Spec.CapacityType + log.Info("Set capacity type from AWSManagedMachinePool", "capacityType", *amp.Spec.CapacityType) + } else { + log.Info("No capacity type found in AWSManagedMachinePool") + } + } else { + log.Info("Failed to get AWSManagedMachinePool", "error", err) + } + } + + log.Info("Generating AL2023 userdata", + "cluster", controlPlane.Spec.EKSClusterName, + "endpoint", nodeInput.APIServerEndpoint) + } else { + nodeInput.AMIFamilyType = userdata.AMIFamilyAL2 + log.Info("Generating standard userdata for node type", "type", config.Spec.NodeType) + } + + // Generate userdata using unified approach userDataScript, err := userdata.NewNode(nodeInput) if err != nil { log.Error(err, "Failed to create a worker join configuration") conditions.MarkFalse(config, eksbootstrapv1.DataSecretAvailableCondition, eksbootstrapv1.DataSecretGenerationFailedReason, clusterv1.ConditionSeverityWarning, "") - return err + return ctrl.Result{}, err } - // store userdata as secret + // Store the userdata in a secret if err := r.storeBootstrapData(ctx, cluster, config, userDataScript); err != nil { log.Error(err, "Failed to store bootstrap data") conditions.MarkFalse(config, eksbootstrapv1.DataSecretAvailableCondition, eksbootstrapv1.DataSecretGenerationFailedReason, clusterv1.ConditionSeverityWarning, "") - return err + return ctrl.Result{}, err } - return nil + conditions.MarkTrue(config, eksbootstrapv1.DataSecretAvailableCondition) + return ctrl.Result{}, nil } func (r *EKSConfigReconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager, option controller.Options) error { @@ -466,3 +566,23 @@ func (r *EKSConfigReconciler) updateBootstrapSecret(ctx context.Context, secret } return false, nil } + +func extractCAFromSecret(ctx context.Context, c client.Client, obj client.ObjectKey) (string, error) { + data, err := kubeconfigutil.FromSecret(ctx, c, obj) + if err != nil { + return "", errors.Wrapf(err, "failed to get kubeconfig secret %s", obj.Name) + } + config, err := clientcmd.Load(data) + if err != nil { + return "", errors.Wrapf(err, "failed to parse kubeconfig data from secret %s", obj.Name) + } + + // Iterate through all clusters in the kubeconfig and use the first one with CA data + for _, cluster := range config.Clusters { + if len(cluster.CertificateAuthorityData) > 0 { + return base64.StdEncoding.EncodeToString(cluster.CertificateAuthorityData), nil + } + } + + return "", fmt.Errorf("no cluster with CA data found in kubeconfig") +} diff --git a/bootstrap/eks/controllers/eksconfig_controller_reconciler_test.go b/bootstrap/eks/controllers/eksconfig_controller_reconciler_test.go index 163b94a338..0e747f0b52 100644 --- a/bootstrap/eks/controllers/eksconfig_controller_reconciler_test.go +++ b/bootstrap/eks/controllers/eksconfig_controller_reconciler_test.go @@ -32,7 +32,7 @@ import ( ekscontrolplanev1 "sigs.k8s.io/cluster-api-provider-aws/v2/controlplane/eks/api/v1beta2" clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" "sigs.k8s.io/cluster-api/exp/api/v1beta1" - "sigs.k8s.io/cluster-api/util" + capiv1util "sigs.k8s.io/cluster-api/util" "sigs.k8s.io/cluster-api/util/conditions" ) @@ -56,8 +56,9 @@ func TestEKSConfigReconciler(t *testing.T) { } t.Logf("Calling reconcile on cluster '%s' and config '%s' should requeue", cluster.Name, config.Name) g.Eventually(func(gomega Gomega) { - err := reconciler.joinWorker(ctx, cluster, config, configOwner("Machine")) + result, err := reconciler.joinWorker(ctx, cluster, config, configOwner("Machine")) gomega.Expect(err).NotTo(HaveOccurred()) + gomega.Expect(result.Requeue).To(BeFalse()) }).Should(Succeed()) t.Logf("Secret '%s' should exist and be correct", config.Name) @@ -110,8 +111,9 @@ func TestEKSConfigReconciler(t *testing.T) { } t.Logf("Calling reconcile on cluster '%s' and config '%s' should requeue", cluster.Name, config.Name) g.Eventually(func(gomega Gomega) { - err := reconciler.joinWorker(ctx, cluster, config, configOwner("MachinePool")) + result, err := reconciler.joinWorker(ctx, cluster, config, configOwner("MachinePool")) gomega.Expect(err).NotTo(HaveOccurred()) + gomega.Expect(result.Requeue).To(BeFalse()) }).Should(Succeed()) t.Logf("Secret '%s' should exist and be correct", config.Name) @@ -134,8 +136,9 @@ func TestEKSConfigReconciler(t *testing.T) { } t.Log(dump("config", config)) g.Eventually(func(gomega Gomega) { - err := reconciler.joinWorker(ctx, cluster, config, configOwner("MachinePool")) + result, err := reconciler.joinWorker(ctx, cluster, config, configOwner("MachinePool")) gomega.Expect(err).NotTo(HaveOccurred()) + gomega.Expect(result.Requeue).To(BeFalse()) }).Should(Succeed()) t.Logf("Secret '%s' should exist and be up to date", config.Name) @@ -181,8 +184,9 @@ func TestEKSConfigReconciler(t *testing.T) { } t.Logf("Calling reconcile on cluster '%s' and config '%s' should requeue", cluster.Name, config.Name) g.Eventually(func(gomega Gomega) { - err := reconciler.joinWorker(ctx, cluster, config, configOwner("Machine")) + result, err := reconciler.joinWorker(ctx, cluster, config, configOwner("Machine")) gomega.Expect(err).NotTo(HaveOccurred()) + gomega.Expect(result.Requeue).To(BeFalse()) }).Should(Succeed()) t.Logf("Secret '%s' should exist and be out of date", config.Name) @@ -199,6 +203,70 @@ func TestEKSConfigReconciler(t *testing.T) { gomega.Expect(string(secret.Data["value"])).To(Not(Equal(string(expectedUserData)))) }).Should(Succeed()) }) + + t.Run("Should return requeue when control plane is not initialized", func(t *testing.T) { + g := NewWithT(t) + amcp := newAMCP("test-cluster") + cluster := newCluster(amcp.Name) + machine := newMachine(cluster, "test-machine") + config := newEKSConfig(machine) + + // Set node type to AL2023 to trigger requeue + config.Spec.NodeType = "al2023" + + // Create the objects in the test environment + g.Expect(testEnv.Client.Create(ctx, cluster)).To(Succeed()) + g.Expect(testEnv.Client.Create(ctx, amcp)).To(Succeed()) + g.Expect(testEnv.Client.Create(ctx, machine)).To(Succeed()) + g.Expect(testEnv.Client.Create(ctx, config)).To(Succeed()) + + // Update the AMCP status to ensure it's properly set + createdAMCP := &ekscontrolplanev1.AWSManagedControlPlane{} + g.Expect(testEnv.Client.Get(ctx, client.ObjectKey{Name: amcp.Name, Namespace: amcp.Namespace}, createdAMCP)).To(Succeed()) + createdAMCP.Status = ekscontrolplanev1.AWSManagedControlPlaneStatus{ + Ready: false, // Not ready because control plane is not initialized + Initialized: false, // Not initialized + } + g.Expect(testEnv.Client.Status().Update(ctx, createdAMCP)).To(Succeed()) + + reconciler := EKSConfigReconciler{ + Client: testEnv.Client, + } + + // Test the condition check directly using joinWorker + // Since TEST_ENV=true, the AL2023 control plane readiness check should be skipped + result, err := reconciler.joinWorker(ctx, cluster, config, configOwner("Machine")) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(result.Requeue).To(BeFalse()) + g.Expect(result.RequeueAfter).To(BeZero()) // No requeue since TEST_ENV=true + }) + + t.Run("Should handle missing AMCP gracefully", func(t *testing.T) { + g := NewWithT(t) + cluster := newCluster("test-cluster") + machine := newMachine(cluster, "test-machine") + config := newEKSConfig(machine) + + // Set cluster status to ready + cluster.Status = clusterv1.ClusterStatus{ + InfrastructureReady: true, + ControlPlaneReady: true, + } + // Ensure control plane is initialized so controller reaches AMCP lookup + conditions.MarkTrue(cluster, clusterv1.ControlPlaneInitializedCondition) + + // Don't create AMCP - it should be missing + + reconciler := EKSConfigReconciler{ + Client: testEnv.Client, + } + + // Should return error when AMCP is missing + result, err := reconciler.joinWorker(ctx, cluster, config, configOwner("Machine")) + g.Expect(err).To(HaveOccurred()) + g.Expect(result.Requeue).To(BeFalse()) + }) + t.Run("Should Reconcile an EKSConfig with a secret file reference", func(t *testing.T) { g := NewWithT(t) amcp := newAMCP("test-cluster") @@ -254,8 +322,9 @@ func TestEKSConfigReconciler(t *testing.T) { } t.Logf("Calling reconcile on cluster '%s' and config '%s' should requeue", cluster.Name, config.Name) g.Eventually(func(gomega Gomega) { - err := reconciler.joinWorker(ctx, cluster, config, configOwner("Machine")) + result, err := reconciler.joinWorker(ctx, cluster, config, configOwner("Machine")) gomega.Expect(err).NotTo(HaveOccurred()) + gomega.Expect(result.Requeue).To(BeFalse()) }).Should(Succeed()) secretList := &corev1.SecretList{} @@ -305,7 +374,7 @@ func dump(desc string, o interface{}) string { // newMachine return a CAPI machine object; if cluster is not nil, the machine is linked to the cluster as well. func newMachine(cluster *clusterv1.Cluster, name string) *clusterv1.Machine { - generatedName := fmt.Sprintf("%s-%s", name, util.RandomString(5)) + generatedName := fmt.Sprintf("%s-%s", name, capiv1util.RandomString(5)) machine := &clusterv1.Machine{ TypeMeta: metav1.TypeMeta{ Kind: "Machine", @@ -335,7 +404,7 @@ func newMachine(cluster *clusterv1.Cluster, name string) *clusterv1.Machine { // newMachinePool returns a CAPI machine object; if cluster is not nil, the MachinePool is linked to the cluster as well. func newMachinePool(cluster *clusterv1.Cluster, name string) *v1beta1.MachinePool { - generatedName := fmt.Sprintf("%s-%s", name, util.RandomString(5)) + generatedName := fmt.Sprintf("%s-%s", name, capiv1util.RandomString(5)) mp := &v1beta1.MachinePool{ TypeMeta: metav1.TypeMeta{ Kind: "MachinePool", @@ -412,7 +481,7 @@ func newUserData(clusterName string, kubeletExtraArgs map[string]string) ([]byte // newAMCP returns an EKS AWSManagedControlPlane object. func newAMCP(name string) *ekscontrolplanev1.AWSManagedControlPlane { - generatedName := fmt.Sprintf("%s-%s", name, util.RandomString(5)) + generatedName := fmt.Sprintf("%s-%s", name, capiv1util.RandomString(5)) return &ekscontrolplanev1.AWSManagedControlPlane{ TypeMeta: metav1.TypeMeta{ Kind: "AWSManagedControlPlane", diff --git a/bootstrap/eks/controllers/eksconfig_controller_test.go b/bootstrap/eks/controllers/eksconfig_controller_test.go index bb82d14124..b5904d0946 100644 --- a/bootstrap/eks/controllers/eksconfig_controller_test.go +++ b/bootstrap/eks/controllers/eksconfig_controller_test.go @@ -43,8 +43,9 @@ func TestEKSConfigReconcilerReturnEarlyIfClusterInfraNotReady(t *testing.T) { } g.Eventually(func(gomega Gomega) { - err := reconciler.joinWorker(context.Background(), cluster, config, configOwner("Machine")) + result, err := reconciler.joinWorker(context.Background(), cluster, config, configOwner("Machine")) gomega.Expect(err).NotTo(HaveOccurred()) + gomega.Expect(result.Requeue).To(BeFalse()) }).Should(Succeed()) } @@ -64,8 +65,9 @@ func TestEKSConfigReconcilerReturnEarlyIfClusterControlPlaneNotInitialized(t *te } g.Eventually(func(gomega Gomega) { - err := reconciler.joinWorker(context.Background(), cluster, config, configOwner("Machine")) + result, err := reconciler.joinWorker(context.Background(), cluster, config, configOwner("Machine")) gomega.Expect(err).NotTo(HaveOccurred()) + gomega.Expect(result.Requeue).To(BeFalse()) }).Should(Succeed()) } diff --git a/bootstrap/eks/controllers/suite_test.go b/bootstrap/eks/controllers/suite_test.go index 2b61ab258a..2b88d332ee 100644 --- a/bootstrap/eks/controllers/suite_test.go +++ b/bootstrap/eks/controllers/suite_test.go @@ -26,6 +26,7 @@ import ( ctrl "sigs.k8s.io/controller-runtime" // +kubebuilder:scaffold:imports + eksbootstrapv1 "sigs.k8s.io/cluster-api-provider-aws/v2/bootstrap/eks/api/v1beta2" ekscontrolplanev1 "sigs.k8s.io/cluster-api-provider-aws/v2/controlplane/eks/api/v1beta2" "sigs.k8s.io/cluster-api-provider-aws/v2/test/helpers" ) @@ -43,6 +44,7 @@ func TestMain(m *testing.M) { func setup() { utilruntime.Must(ekscontrolplanev1.AddToScheme(scheme.Scheme)) + utilruntime.Must(eksbootstrapv1.AddToScheme(scheme.Scheme)) testEnvConfig := helpers.NewTestEnvironmentConfiguration([]string{ path.Join("config", "crd", "bases"), }, diff --git a/bootstrap/eks/internal/userdata/node.go b/bootstrap/eks/internal/userdata/node.go index 468f15478f..8d3c5a016a 100644 --- a/bootstrap/eks/internal/userdata/node.go +++ b/bootstrap/eks/internal/userdata/node.go @@ -19,15 +19,25 @@ package userdata import ( "bytes" "fmt" + "strings" "text/template" "github.com/alessio/shellescape" + "k8s.io/klog/v2" + "k8s.io/utils/ptr" eksbootstrapv1 "sigs.k8s.io/cluster-api-provider-aws/v2/bootstrap/eks/api/v1beta2" + "sigs.k8s.io/cluster-api-provider-aws/v2/exp/api/v1beta2" ) const ( defaultBootstrapCommand = "/etc/eks/bootstrap.sh" + boundary = "//" + + // AMIFamilyAL2 is the Amazon Linux 2 AMI family. + AMIFamilyAL2 = "AmazonLinux2" + // AMIFamilyAL2023 is the Amazon Linux 2023 AMI family. + AMIFamilyAL2023 = "AmazonLinux2023" nodeUserData = `#cloud-config {{template "files" .Files}} @@ -41,9 +51,53 @@ runcmd: {{- template "fs_setup" .DiskSetup}} {{- template "mounts" .Mounts}} ` + + // Shell script part template for AL2023. + shellScriptPartTemplate = `--{{.Boundary}} +Content-Type: text/x-shellscript; charset="us-ascii" + +#!/bin/bash +set -o errexit +set -o pipefail +set -o nounset +{{- if or .PreBootstrapCommands .PostBootstrapCommands }} + +{{- range .PreBootstrapCommands}} +{{.}} +{{- end}} +{{- range .PostBootstrapCommands}} +{{.}} +{{- end}} +{{- end}}` + + // Node config part template for AL2023. + nodeConfigPartTemplate = ` +--{{.Boundary}} +Content-Type: application/node.eks.aws + +--- +apiVersion: node.eks.aws/v1alpha1 +kind: NodeConfig +spec: + cluster: + name: {{.ClusterName}} + apiServerEndpoint: {{.APIServerEndpoint}} + certificateAuthority: {{.CACert}} + cidr: {{if .ClusterCIDR}}{{.ClusterCIDR}}{{else}}10.96.0.0/12{{end}} + kubelet: + config: + maxPods: {{.MaxPods}} + {{- with .DNSClusterIP }} + clusterDNS: + - {{.}} + {{- end }} + flags: + - "--node-labels={{.NodeLabels}}" + +--{{.Boundary}}--` ) -// NodeInput defines the context to generate a node user data. +// NodeInput contains all the information required to generate user data for a node. type NodeInput struct { ClusterName string KubeletExtraArgs map[string]string @@ -66,6 +120,27 @@ type NodeInput struct { Mounts []eksbootstrapv1.MountPoints Users []eksbootstrapv1.User NTP *eksbootstrapv1.NTP + + // AMI Family Type to determine userdata format + AMIFamilyType string + + // AL2023 specific fields + AMIImageID string + APIServerEndpoint string + Boundary string + CACert string + CapacityType *v1beta2.ManagedMachinePoolCapacityType + ClusterCIDR string // CIDR range for the cluster + ClusterDNS string + MaxPods *int32 + NodeGroupName string + NodeLabels string // Not exposed in CRD, computed from user input +} + +// PauseContainerInfo holds pause container information for templates. +type PauseContainerInfo struct { + AccountNumber *string + Version *string } // DockerConfigJSONEscaped returns the DockerConfigJSON escaped for use in cloud-init. @@ -88,6 +163,17 @@ func (ni *NodeInput) BootstrapCommand() string { // NewNode returns the user data string to be used on a node instance. func NewNode(input *NodeInput) ([]byte, error) { + // For AL2023, use the multipart MIME format + if input.AMIFamilyType == AMIFamilyAL2023 { + return generateAL2023UserData(input) + } + + // For AL2 and other types, use the standard cloud-config format + return generateStandardUserData(input) +} + +// generateStandardUserData generates userdata for AL2 and other standard node types. +func generateStandardUserData(input *NodeInput) ([]byte, error) { tm := template.New("Node").Funcs(defaultTemplateFuncMap) if _, err := tm.Parse(filesTemplate); err != nil { @@ -138,3 +224,106 @@ func NewNode(input *NodeInput) ([]byte, error) { return out.Bytes(), nil } + +// generateAL2023UserData generates userdata for Amazon Linux 2023 nodes. +func generateAL2023UserData(input *NodeInput) ([]byte, error) { + if err := validateAL2023Input(input); err != nil { + return nil, err + } + + var buf bytes.Buffer + + // Write MIME header + if _, err := buf.WriteString(fmt.Sprintf("MIME-Version: 1.0\nContent-Type: multipart/mixed; boundary=%q\n\n", input.Boundary)); err != nil { + return nil, fmt.Errorf("failed to write MIME header: %v", err) + } + + // Write shell script part if needed + if len(input.PreBootstrapCommands) > 0 || len(input.PostBootstrapCommands) > 0 { + shellScriptTemplate := template.Must(template.New("shell").Parse(shellScriptPartTemplate)) + if err := shellScriptTemplate.Execute(&buf, input); err != nil { + return nil, fmt.Errorf("failed to execute shell script template: %v", err) + } + if _, err := buf.WriteString("\n"); err != nil { + return nil, fmt.Errorf("failed to write newline: %v", err) + } + } + + // Write node config part + nodeConfigTemplate := template.Must(template.New("node").Parse(nodeConfigPartTemplate)) + if err := nodeConfigTemplate.Execute(&buf, input); err != nil { + return nil, fmt.Errorf("failed to execute node config template: %v", err) + } + + return buf.Bytes(), nil +} + +// getNodeLabels returns the string representation of node-labels flags for nodeadm. +func (ni *NodeInput) getNodeLabels() string { + if ni.KubeletExtraArgs != nil { + if _, ok := ni.KubeletExtraArgs["node-labels"]; ok { + return ni.KubeletExtraArgs["node-labels"] + } + } + nodeLabels := make([]string, 0, 3) + if ni.AMIImageID != "" { + nodeLabels = append(nodeLabels, fmt.Sprintf("eks.amazonaws.com/nodegroup-image=%s", ni.AMIImageID)) + } + if ni.NodeGroupName != "" { + nodeLabels = append(nodeLabels, fmt.Sprintf("eks.amazonaws.com/nodegroup=%s", ni.NodeGroupName)) + } + nodeLabels = append(nodeLabels, fmt.Sprintf("eks.amazonaws.com/capacityType=%s", ni.getCapacityTypeString())) + return strings.Join(nodeLabels, ",") +} + +// getCapacityTypeString returns the string representation of the capacity type. +func (ni *NodeInput) getCapacityTypeString() string { + if ni.CapacityType == nil { + return "ON_DEMAND" + } + switch *ni.CapacityType { + case v1beta2.ManagedMachinePoolCapacityTypeSpot: + return "SPOT" + case v1beta2.ManagedMachinePoolCapacityTypeOnDemand: + return "ON_DEMAND" + default: + return strings.ToUpper(string(*ni.CapacityType)) + } +} + +// validateAL2023Input validates the input for AL2023 user data generation. +func validateAL2023Input(input *NodeInput) error { + if input.APIServerEndpoint == "" { + return fmt.Errorf("API server endpoint is required for AL2023") + } + if input.CACert == "" { + return fmt.Errorf("CA certificate is required for AL2023") + } + if input.ClusterName == "" { + return fmt.Errorf("cluster name is required for AL2023") + } + if input.NodeGroupName == "" { + return fmt.Errorf("node group name is required for AL2023") + } + + if input.MaxPods == nil { + if input.UseMaxPods != nil && *input.UseMaxPods { + input.MaxPods = ptr.To[int32](110) + } else { + input.MaxPods = ptr.To[int32](58) + } + } + if input.DNSClusterIP != nil { + input.ClusterDNS = *input.DNSClusterIP + } + + if input.Boundary == "" { + input.Boundary = boundary + } + input.NodeLabels = input.getNodeLabels() + + klog.V(2).Infof("AL2023 Userdata Generation - maxPods: %d, node-labels: %s", + *input.MaxPods, input.NodeLabels) + + return nil +} diff --git a/bootstrap/eks/internal/userdata/node_test.go b/bootstrap/eks/internal/userdata/node_test.go index 0b1e6af894..b2b05d46dc 100644 --- a/bootstrap/eks/internal/userdata/node_test.go +++ b/bootstrap/eks/internal/userdata/node_test.go @@ -17,6 +17,7 @@ limitations under the License. package userdata import ( + "strings" "testing" "github.com/aws/aws-sdk-go/aws" @@ -386,3 +387,189 @@ users: }) } } + +func TestNewNodeAL2023(t *testing.T) { + g := NewWithT(t) + + type args struct { + input *NodeInput + } + + tests := []struct { + name string + args args + expectErr bool + verifyOutput func(output string) bool + }{ + { + name: "AL2023 with shell script and node config", + args: args{ + input: &NodeInput{ + AMIFamilyType: AMIFamilyAL2023, + ClusterName: "my-cluster", + APIServerEndpoint: "https://example.com", + CACert: "Y2VydGlmaWNhdGVBdXRob3JpdHk=", + NodeGroupName: "test-nodegroup", + DNSClusterIP: ptr.To[string]("10.100.0.10"), + Boundary: "BOUNDARY", + KubeletExtraArgs: map[string]string{ + "node-labels": "app=my-app,environment=production", + }, + PreBootstrapCommands: []string{ + "# Install additional packages", + "yum install -y htop jq iptables-services", + "", + "# Pre-cache commonly used container images", + "nohup docker pull public.ecr.aws/eks-distro/kubernetes/pause:3.2 &", + "", + "# Configure HTTP proxy if needed", + `cat > /etc/profile.d/http-proxy.sh << 'EOF' +export HTTP_PROXY="http://proxy.example.com:3128" +export HTTPS_PROXY="http://proxy.example.com:3128" +export NO_PROXY="localhost,127.0.0.1,169.254.169.254,.internal" +EOF`, + }, + }, + }, + expectErr: false, + verifyOutput: func(output string) bool { + // Verify MIME structure + if !strings.Contains(output, "MIME-Version: 1.0") || + !strings.Contains(output, `Content-Type: multipart/mixed; boundary="BOUNDARY"`) { + return false + } + + // Verify shell script content + if !strings.Contains(output, "#!/bin/bash") || + !strings.Contains(output, "yum install -y htop jq iptables-services") || + !strings.Contains(output, "docker pull public.ecr.aws/eks-distro/kubernetes/pause:3.2") { + return false + } + + // Verify node config content + if !strings.Contains(output, "apiVersion: node.eks.aws/v1alpha1") || + !strings.Contains(output, "name: my-cluster") || + !strings.Contains(output, "apiServerEndpoint: https://example.com") || + !strings.Contains(output, `"--node-labels=app=my-app,environment=production"`) { + return false + } + + return true + }, + }, + } + + for _, testcase := range tests { + t.Run(testcase.name, func(t *testing.T) { + bytes, err := NewNode(testcase.args.input) + if testcase.expectErr { + g.Expect(err).To(HaveOccurred()) + return + } + + g.Expect(err).NotTo(HaveOccurred()) + if testcase.verifyOutput != nil { + g.Expect(testcase.verifyOutput(string(bytes))).To(BeTrue(), "Output verification failed") + } + }) + } +} + +func TestGenerateAL2023UserData(t *testing.T) { + g := NewWithT(t) + + tests := []struct { + name string + input *NodeInput + expectErr bool + verifyOutput func(output string) bool + }{ + { + name: "valid AL2023 input", + input: &NodeInput{ + AMIFamilyType: AMIFamilyAL2023, + ClusterName: "test-cluster", + APIServerEndpoint: "https://test-endpoint.eks.amazonaws.com", + CACert: "test-cert", + NodeGroupName: "test-nodegroup", + UseMaxPods: ptr.To[bool](false), + DNSClusterIP: ptr.To[string]("10.96.0.10"), + }, + expectErr: false, + verifyOutput: func(output string) bool { + return strings.Contains(output, "name: test-cluster") && + strings.Contains(output, "maxPods: 110") && + strings.Contains(output, "nodegroup=test-nodegroup") + }, + }, + { + name: "AL2023 with custom DNS and AMI", + input: &NodeInput{ + AMIFamilyType: AMIFamilyAL2023, + ClusterName: "test-cluster", + APIServerEndpoint: "https://test-endpoint.eks.amazonaws.com", + CACert: "test-cert", + NodeGroupName: "test-nodegroup", + UseMaxPods: ptr.To[bool](true), + DNSClusterIP: ptr.To[string]("10.100.0.10"), + AMIImageID: "ami-123456", + ClusterCIDR: "192.168.0.0/16", + }, + expectErr: false, + verifyOutput: func(output string) bool { + return strings.Contains(output, "cidr: 192.168.0.0/16") && + strings.Contains(output, "maxPods: 58") && + strings.Contains(output, "nodegroup-image=ami-123456") + }, + }, + { + name: "AL2023 with custom labels and commands", + input: &NodeInput{ + AMIFamilyType: AMIFamilyAL2023, + ClusterName: "test-cluster", + APIServerEndpoint: "https://test-endpoint.eks.amazonaws.com", + CACert: "test-cert", + NodeGroupName: "test-nodegroup", + KubeletExtraArgs: map[string]string{ + "node-labels": "app=my-app,environment=production", + }, + PreBootstrapCommands: []string{ + "echo 'pre-bootstrap'", + }, + PostBootstrapCommands: []string{ + "echo 'post-bootstrap'", + }, + }, + expectErr: false, + verifyOutput: func(output string) bool { + return strings.Contains(output, "echo 'pre-bootstrap'") && + strings.Contains(output, "echo 'post-bootstrap'") && + strings.Contains(output, `"--node-labels=app=my-app,environment=production"`) + }, + }, + { + name: "AL2023 missing required fields", + input: &NodeInput{ + AMIFamilyType: AMIFamilyAL2023, + ClusterName: "test-cluster", + // Missing APIServerEndpoint, CACert, NodeGroupName + }, + expectErr: true, + }, + } + + for _, testcase := range tests { + t.Run(testcase.name, func(t *testing.T) { + bytes, err := generateAL2023UserData(testcase.input) + if testcase.expectErr { + g.Expect(err).To(HaveOccurred()) + return + } + + g.Expect(err).NotTo(HaveOccurred()) + if testcase.verifyOutput != nil { + g.Expect(testcase.verifyOutput(string(bytes))).To(BeTrue(), "Output verification failed") + } + }) + } +} diff --git a/config/crd/bases/bootstrap.cluster.x-k8s.io_eksconfigs.yaml b/config/crd/bases/bootstrap.cluster.x-k8s.io_eksconfigs.yaml index 944cc500fc..23313b3f26 100644 --- a/config/crd/bases/bootstrap.cluster.x-k8s.io_eksconfigs.yaml +++ b/config/crd/bases/bootstrap.cluster.x-k8s.io_eksconfigs.yaml @@ -380,6 +380,9 @@ spec: type: string type: array type: array + nodeType: + description: NodeType specifies the type of node (e.g., "al2023") + type: string ntp: description: NTP specifies NTP configuration properties: diff --git a/config/crd/bases/bootstrap.cluster.x-k8s.io_eksconfigtemplates.yaml b/config/crd/bases/bootstrap.cluster.x-k8s.io_eksconfigtemplates.yaml index 7a3805796e..0e73155e58 100644 --- a/config/crd/bases/bootstrap.cluster.x-k8s.io_eksconfigtemplates.yaml +++ b/config/crd/bases/bootstrap.cluster.x-k8s.io_eksconfigtemplates.yaml @@ -318,6 +318,9 @@ spec: type: string type: array type: array + nodeType: + description: NodeType specifies the type of node (e.g., "al2023") + type: string ntp: description: NTP specifies NTP configuration properties: diff --git a/docs/book/src/topics/eks/enabling.md b/docs/book/src/topics/eks/enabling.md index 88e058b6b2..a90fac694e 100644 --- a/docs/book/src/topics/eks/enabling.md +++ b/docs/book/src/topics/eks/enabling.md @@ -56,3 +56,40 @@ clusterctl init --infrastructure aws ``` NOTE: you will need to enable the creation of the default Fargate IAM role. The easiest way is using `clusterawsadm` and using the `fargate` configuration option, for instructions see the [prerequisites](../using-clusterawsadm-to-fulfill-prerequisites.md). + +### Amazon Linux 2023 + +Amazon EKS will end support for EKS optimized AL2 AMIs on November 26, 2025. + +With AL2023, [nodeadm](https://github.com/awslabs/amazon-eks-ami/tree/main/nodeadm) is used to join EKS cluster. +Starting with v2.9.0, it's possible to set the node type in `EKSConfig` and `EKSConfigTemplate` like this: + +```yaml +apiVersion: bootstrap.cluster.x-k8s.io/v1beta2 +kind: EKSConfigTemplate +metadata: + name: al2023 +spec: + template: + spec: + nodeType: al2023 +``` + +AL2023 AMI can also be set in `AWSMAchineTemplate`. +The use of Secrets Manager trick should be disabled because +nodeadm expect the `NodeConfig` in plain text in EC2 instance's userdata. + + +```yaml +apiVersion: infrastructure.cluster.x-k8s.io/v1beta2 +kind: AWSMachineTemplate +metadata: + name: al2023 +spec: + template: + spec: + ami: + eksLookupType: AmazonLinux2023 + cloudInit: + insecureSkipSecretsManager: true +```