✨ Add support for EKSConfig LaunchTemplate bootstrapping for AL2023 using nodeadm #5553
Changes to the EKSConfig API types:
@@ -22,8 +22,20 @@ import (
	clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
)

// NodeType specifies the type of node
// +kubebuilder:validation:Enum=al2023
type NodeType string

const (
	// NodeTypeAL2023 represents the AL2023 node type.
	NodeTypeAL2023 NodeType = "al2023"
)
Review thread on lines +29 to +32:
- Same here a NodeTypeAL2
- We could use the ones you defined for the AMIFamilyType also here.
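To make these suggestions concrete, here is a minimal sketch of what an extended enum could look like; the "al2" value and the NodeTypeAL2 constant name are assumptions for illustration and are not part of this PR:

```go
// Sketch only (not in this PR): make both node families explicit in the enum.
package v1beta2

// NodeType specifies the bootstrap flavor of the node.
// +kubebuilder:validation:Enum=al2;al2023
type NodeType string

const (
	// NodeTypeAL2 represents the legacy AL2 node type (assumed name, not defined in this PR).
	NodeTypeAL2 NodeType = "al2"
	// NodeTypeAL2023 represents the AL2023 node type.
	NodeTypeAL2023 NodeType = "al2023"
)
```

Alternatively, as the second comment suggests, the existing AMIFamilyType constants in the userdata package could be reused here instead of introducing a parallel type.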
// EKSConfigSpec defines the desired state of Amazon EKS Bootstrap Configuration.
type EKSConfigSpec struct {
	// NodeType specifies the type of node (e.g., "al2023")
	// +optional
	NodeType NodeType `json:"nodeType,omitempty"`
Review thread on lines +36 to +38:
- I think […]
- We would also need to specify what the default is (I guess […]
- The enum is added on the type definition (https://github.com/kubernetes-sigs/cluster-api-provider-aws/pull/5553/files#diff-64e2d3d6a1323c5ae37a84cfc459f926fe7e329e83a4f695df28de89f19a3172R26). Only […]
- This API is a little confusing to me. We're essentially specifying an AMI family here. We have a field like this already in AWSMachine. I think we shouldn't repeat this information here.
- Another thing worth noting is that Amazon Linux 2 will reach EOL very soon, in June 2026 (source: https://aws.amazon.com/amazon-linux-2/faqs/). EKS has largely moved away from AL2 to either AL2023 or Bottlerocket. Bottlerocket seems to have its own setbacks (see kubernetes-sigs/cluster-api#7840), so realistically the only option is going to be AL2023. EDIT: per https://awslabs.github.io/amazon-eks-ami/, it looks like they won't be building this AMI.
	// KubeletExtraArgs passes the specified kubelet args into the Amazon EKS machine bootstrap script
	// +optional
	KubeletExtraArgs map[string]string `json:"kubeletExtraArgs,omitempty"`
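For context on the API being debated above, this is roughly how a consumer would opt into AL2023 bootstrapping with the field as proposed in this PR; it is an illustrative sketch only, and the object name, namespace, and label values are placeholders:

```go
package main

import (
	"fmt"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

	eksbootstrapv1 "sigs.k8s.io/cluster-api-provider-aws/v2/bootstrap/eks/api/v1beta2"
)

func main() {
	cfg := &eksbootstrapv1.EKSConfig{
		ObjectMeta: metav1.ObjectMeta{Name: "al2023-workers", Namespace: "default"},
		Spec: eksbootstrapv1.EKSConfigSpec{
			// Opt into nodeadm-based AL2023 bootstrapping. Leaving NodeType empty
			// falls back to the existing AL2 path; the PR does not declare a
			// kubebuilder default, which is what the thread above points out.
			NodeType: eksbootstrapv1.NodeTypeAL2023,
			// KubeletExtraArgs continues to apply to both node families.
			KubeletExtraArgs: map[string]string{"node-labels": "workload=general"},
		},
	}
	fmt.Printf("nodeType=%s\n", cfg.Spec.NodeType)
}
```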
Changes to the EKSConfig controller (package controllers):
@@ -20,6 +20,9 @@ package controllers
import (
	"bytes"
	"context"
	"encoding/base64"
	"fmt"
	"os"
	"time"

	"github.com/pkg/errors"

@@ -28,6 +31,7 @@ import (
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/runtime"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/client-go/tools/clientcmd"
	"k8s.io/klog/v2"
	"k8s.io/utils/ptr"
	ctrl "sigs.k8s.io/controller-runtime"

@@ -39,14 +43,16 @@ import (
	eksbootstrapv1 "sigs.k8s.io/cluster-api-provider-aws/v2/bootstrap/eks/api/v1beta2"
	"sigs.k8s.io/cluster-api-provider-aws/v2/bootstrap/eks/internal/userdata"
	ekscontrolplanev1 "sigs.k8s.io/cluster-api-provider-aws/v2/controlplane/eks/api/v1beta2"
	expinfrav1 "sigs.k8s.io/cluster-api-provider-aws/v2/exp/api/v1beta2"
	"sigs.k8s.io/cluster-api-provider-aws/v2/feature"
	"sigs.k8s.io/cluster-api-provider-aws/v2/pkg/logger"
	"sigs.k8s.io/cluster-api-provider-aws/v2/util/paused"
	clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
	bsutil "sigs.k8s.io/cluster-api/bootstrap/util"
	expclusterv1 "sigs.k8s.io/cluster-api/exp/api/v1beta1"
	"sigs.k8s.io/cluster-api/feature"
	"sigs.k8s.io/cluster-api/util"
	"sigs.k8s.io/cluster-api/util/conditions"
	kubeconfigutil "sigs.k8s.io/cluster-api/util/kubeconfig"
	"sigs.k8s.io/cluster-api/util/patch"
	"sigs.k8s.io/cluster-api/util/predicates"
)

@@ -143,7 +149,7 @@ func (r *EKSConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
		}
	}()

	return ctrl.Result{}, r.joinWorker(ctx, cluster, config, configOwner)
	return r.joinWorker(ctx, cluster, config, configOwner)
}

func (r *EKSConfigReconciler) resolveFiles(ctx context.Context, cfg *eksbootstrapv1.EKSConfig) ([]eksbootstrapv1.File, error) {

@@ -181,7 +187,7 @@ func (r *EKSConfigReconciler) resolveSecretFileContent(ctx context.Context, ns s
	return data, nil
}

func (r *EKSConfigReconciler) joinWorker(ctx context.Context, cluster *clusterv1.Cluster, config *eksbootstrapv1.EKSConfig, configOwner *bsutil.ConfigOwner) error {
func (r *EKSConfigReconciler) joinWorker(ctx context.Context, cluster *clusterv1.Cluster, config *eksbootstrapv1.EKSConfig, configOwner *bsutil.ConfigOwner) (ctrl.Result, error) {
	log := logger.FromContext(ctx)

	// only need to reconcile the secret for Machine kinds once, but MachinePools need updates for new launch templates

@@ -195,15 +201,15 @@ func (r *EKSConfigReconciler) joinWorker(ctx context.Context, cluster *clusterv1
		err := r.Client.Get(ctx, secretKey, existingSecret)
		switch {
		case err == nil:
			return nil
			return ctrl.Result{}, nil
		case !apierrors.IsNotFound(err):
			log.Error(err, "unable to check for existing bootstrap secret")
			return err
			return ctrl.Result{}, err
		}
	}

	if cluster.Spec.ControlPlaneRef == nil || cluster.Spec.ControlPlaneRef.Kind != "AWSManagedControlPlane" {
		return errors.New("Cluster's controlPlaneRef needs to be an AWSManagedControlPlane in order to use the EKS bootstrap provider")
		return ctrl.Result{}, errors.New("Cluster's controlPlaneRef needs to be an AWSManagedControlPlane in order to use the EKS bootstrap provider")
	}

	if !cluster.Status.InfrastructureReady {

@@ -212,30 +218,54 @@ func (r *EKSConfigReconciler) joinWorker(ctx context.Context, cluster *clusterv1
			eksbootstrapv1.DataSecretAvailableCondition,
			eksbootstrapv1.WaitingForClusterInfrastructureReason,
			clusterv1.ConditionSeverityInfo, "")
		return nil
		return ctrl.Result{}, nil
	}

	if !conditions.IsTrue(cluster, clusterv1.ControlPlaneInitializedCondition) {
		log.Info("Control Plane has not yet been initialized")
		conditions.MarkFalse(config, eksbootstrapv1.DataSecretAvailableCondition, eksbootstrapv1.WaitingForControlPlaneInitializationReason, clusterv1.ConditionSeverityInfo, "")
		return nil
		conditions.MarkFalse(config, eksbootstrapv1.DataSecretAvailableCondition,
			eksbootstrapv1.DataSecretGenerationFailedReason,
			clusterv1.ConditionSeverityInfo, "Control plane is not initialized yet")

		// For AL2023, requeue to ensure we retry when control plane is ready
		// For AL2, follow upstream behavior and return nil
		if config.Spec.NodeType == eksbootstrapv1.NodeTypeAL2023 {
			log.Info("AL2023 detected, returning requeue after 30 seconds")
			return ctrl.Result{RequeueAfter: 30 * time.Second}, nil
		}
		log.Info("AL2 detected, returning no requeue")
		return ctrl.Result{}, nil
	}

	// Get the AWSManagedControlPlane
	controlPlane := &ekscontrolplanev1.AWSManagedControlPlane{}
	if err := r.Get(ctx, client.ObjectKey{Name: cluster.Spec.ControlPlaneRef.Name, Namespace: cluster.Spec.ControlPlaneRef.Namespace}, controlPlane); err != nil {
		return err
		return ctrl.Result{}, errors.Wrap(err, "failed to get control plane")
	}

	// Check if control plane is ready (skip in test environments for AL2023)
	if config.Spec.NodeType == eksbootstrapv1.NodeTypeAL2023 && !conditions.IsTrue(controlPlane, ekscontrolplanev1.EKSControlPlaneReadyCondition) {
		// Skip control plane readiness check for AL2023 in test environment
		if os.Getenv("TEST_ENV") != "true" {
			log.Info("AL2023 detected, waiting for control plane to be ready")
			conditions.MarkFalse(config, eksbootstrapv1.DataSecretAvailableCondition,
				eksbootstrapv1.DataSecretGenerationFailedReason,
				clusterv1.ConditionSeverityInfo, "Control plane is not ready yet")
			return ctrl.Result{RequeueAfter: 30 * time.Second}, nil
		}
		log.Info("Skipping control plane readiness check for AL2023 in test environment")
	}
	log.Info("Control plane is ready, proceeding with userdata generation")

	log.Info("Generating userdata")
	files, err := r.resolveFiles(ctx, config)
	if err != nil {
		log.Info("Failed to resolve files for user data")
		conditions.MarkFalse(config, eksbootstrapv1.DataSecretAvailableCondition, eksbootstrapv1.DataSecretGenerationFailedReason, clusterv1.ConditionSeverityWarning, "%s", err.Error())
		return err
		return ctrl.Result{}, err
	}

	// Create unified NodeInput for both AL2 and AL2023
	nodeInput := &userdata.NodeInput{
		// AWSManagedControlPlane webhooks default and validate EKSClusterName
		ClusterName: controlPlane.Spec.EKSClusterName,
		KubeletExtraArgs: config.Spec.KubeletExtraArgs,
		ContainerRuntime: config.Spec.ContainerRuntime,
@@ -251,7 +281,9 @@ func (r *EKSConfigReconciler) joinWorker(ctx context.Context, cluster *clusterv1
		DiskSetup: config.Spec.DiskSetup,
		Mounts: config.Spec.Mounts,
		Files: files,
		ClusterCIDR: controlPlane.Spec.NetworkSpec.VPC.CidrBlock,
	}

	if config.Spec.PauseContainer != nil {
		nodeInput.PauseContainerAccount = &config.Spec.PauseContainer.AccountNumber
		nodeInput.PauseContainerVersion = &config.Spec.PauseContainer.Version

@@ -271,22 +303,85 @@ func (r *EKSConfigReconciler) joinWorker(ctx context.Context, cluster *clusterv1
		nodeInput.IPFamily = ptr.To[string]("ipv6")
	}

	// generate userdata
	// Set AMI family type and AL2023-specific fields if needed
	if config.Spec.NodeType == eksbootstrapv1.NodeTypeAL2023 {
		log.Info("Processing AL2023 node type")
		nodeInput.AMIFamilyType = userdata.AMIFamilyAL2023

		// Set AL2023-specific fields
		nodeInput.APIServerEndpoint = controlPlane.Spec.ControlPlaneEndpoint.Host
		nodeInput.NodeGroupName = config.Name

		// In test environments, provide a mock CA certificate
		if os.Getenv("TEST_ENV") == "true" {
			log.Info("Using mock CA certificate for test environment")
			nodeInput.CACert = "mock-ca-certificate-for-testing"
		} else {
			// Fetch CA cert from KubeConfig secret
			// We already have the cluster object passed to this function
			obj := client.ObjectKey{
				Namespace: cluster.Namespace,
				Name: cluster.Name,
			}
			ca, err := extractCAFromSecret(ctx, r.Client, obj)
			if err != nil {
				log.Error(err, "Failed to extract CA from kubeconfig secret")
				conditions.MarkFalse(config, eksbootstrapv1.DataSecretAvailableCondition,
					eksbootstrapv1.DataSecretGenerationFailedReason,
					clusterv1.ConditionSeverityWarning,
					"Failed to extract CA from kubeconfig secret: %v", err)
				return ctrl.Result{}, err
			}
			nodeInput.CACert = ca
		}

		// Get AMI ID from AWSManagedMachinePool's launch template if specified
		if configOwner.GetKind() == "AWSManagedMachinePool" {
Review comment on this block: Is there any reason that users who don't use managed machine pools can't set AMI IDs like users of […]
			amp := &expinfrav1.AWSManagedMachinePool{}
			if err := r.Get(ctx, client.ObjectKey{Namespace: config.Namespace, Name: configOwner.GetName()}, amp); err == nil {
				log.Info("Found AWSManagedMachinePool", "name", amp.Name, "launchTemplate", amp.Spec.AWSLaunchTemplate != nil)
				if amp.Spec.AWSLaunchTemplate != nil && amp.Spec.AWSLaunchTemplate.AMI.ID != nil {
					nodeInput.AMIImageID = *amp.Spec.AWSLaunchTemplate.AMI.ID
					log.Info("Set AMI ID from launch template", "amiID", nodeInput.AMIImageID)
				} else {
					log.Info("No AMI ID found in launch template")
				}
				if amp.Spec.CapacityType != nil {
					nodeInput.CapacityType = amp.Spec.CapacityType
					log.Info("Set capacity type from AWSManagedMachinePool", "capacityType", *amp.Spec.CapacityType)
				} else {
					log.Info("No capacity type found in AWSManagedMachinePool")
				}
			} else {
				log.Info("Failed to get AWSManagedMachinePool", "error", err)
			}
		}

		log.Info("Generating AL2023 userdata",
			"cluster", controlPlane.Spec.EKSClusterName,
			"endpoint", nodeInput.APIServerEndpoint)
	} else {
		nodeInput.AMIFamilyType = userdata.AMIFamilyAL2
		log.Info("Generating standard userdata for node type", "type", config.Spec.NodeType)
	}

	// Generate userdata using unified approach
	userDataScript, err := userdata.NewNode(nodeInput)
	if err != nil {
		log.Error(err, "Failed to create a worker join configuration")
		conditions.MarkFalse(config, eksbootstrapv1.DataSecretAvailableCondition, eksbootstrapv1.DataSecretGenerationFailedReason, clusterv1.ConditionSeverityWarning, "")
		return err
		return ctrl.Result{}, err
	}

	// store userdata as secret
	// Store the userdata in a secret
	if err := r.storeBootstrapData(ctx, cluster, config, userDataScript); err != nil {
		log.Error(err, "Failed to store bootstrap data")
		conditions.MarkFalse(config, eksbootstrapv1.DataSecretAvailableCondition, eksbootstrapv1.DataSecretGenerationFailedReason, clusterv1.ConditionSeverityWarning, "")
		return err
		return ctrl.Result{}, err
	}

	return nil
	conditions.MarkTrue(config, eksbootstrapv1.DataSecretAvailableCondition)
	return ctrl.Result{}, nil
}

func (r *EKSConfigReconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager, option controller.Options) error {

@@ -466,3 +561,23 @@ func (r *EKSConfigReconciler) updateBootstrapSecret(ctx context.Context, secret
	}
	return false, nil
}

func extractCAFromSecret(ctx context.Context, c client.Client, obj client.ObjectKey) (string, error) {
	data, err := kubeconfigutil.FromSecret(ctx, c, obj)
	if err != nil {
		return "", errors.Wrapf(err, "failed to get kubeconfig secret %s", obj.Name)
	}
	config, err := clientcmd.Load(data)
	if err != nil {
		return "", errors.Wrapf(err, "failed to parse kubeconfig data from secret %s", obj.Name)
	}

	// Iterate through all clusters in the kubeconfig and use the first one with CA data
	for _, cluster := range config.Clusters {
		if len(cluster.CertificateAuthorityData) > 0 {
			return base64.StdEncoding.EncodeToString(cluster.CertificateAuthorityData), nil
		}
	}

	return "", fmt.Errorf("no cluster with CA data found in kubeconfig")
}
Review comment: See my comments below regarding the naming of this field.
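To illustrate how the new extractCAFromSecret helper behaves against the kubeconfig secret that Cluster API writes for a workload cluster, here is a hedged, test-style sketch that reproduces the same steps with a fake client. The cluster name, namespace, endpoint, and CA bytes are made up, and the secret layout (name suffix "-kubeconfig", data key "value") is the one kubeconfigutil.FromSecret expects:

```go
package main

import (
	"context"
	"encoding/base64"
	"fmt"

	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/tools/clientcmd"
	clientcmdapi "k8s.io/client-go/tools/clientcmd/api"
	kubeconfigutil "sigs.k8s.io/cluster-api/util/kubeconfig"
	"sigs.k8s.io/controller-runtime/pkg/client"
	"sigs.k8s.io/controller-runtime/pkg/client/fake"
)

func main() {
	// Build a kubeconfig carrying CA data for a single cluster, the shape
	// stored in the "<cluster-name>-kubeconfig" secret.
	kcfg := clientcmdapi.Config{
		Clusters: map[string]*clientcmdapi.Cluster{
			"my-cluster": {
				Server:                   "https://example.eks.amazonaws.com", // placeholder endpoint
				CertificateAuthorityData: []byte("fake-ca-bytes"),
			},
		},
	}
	raw, err := clientcmd.Write(kcfg)
	if err != nil {
		panic(err)
	}

	secret := &corev1.Secret{
		ObjectMeta: metav1.ObjectMeta{Name: "my-cluster-kubeconfig", Namespace: "default"},
		Data:       map[string][]byte{"value": raw},
	}
	c := fake.NewClientBuilder().WithObjects(secret).Build()

	// Same steps as the controller's extractCAFromSecret helper.
	data, err := kubeconfigutil.FromSecret(context.TODO(), c, client.ObjectKey{Namespace: "default", Name: "my-cluster"})
	if err != nil {
		panic(err)
	}
	cfg, err := clientcmd.Load(data)
	if err != nil {
		panic(err)
	}
	for _, cl := range cfg.Clusters {
		if len(cl.CertificateAuthorityData) > 0 {
			// The helper returns the CA base64-encoded for use in the nodeadm userdata.
			fmt.Println(base64.StdEncoding.EncodeToString(cl.CertificateAuthorityData))
		}
	}
}
```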