Skip to content

Commit bc56b00

Browse files
ryan-mistjoshuakguo
authored andcommitted
chore: add NodeClass compatibility filtering
1 parent 69fb7f0 commit bc56b00

File tree

7 files changed

+365
-86
lines changed

7 files changed

+365
-86
lines changed

pkg/controllers/nodeclass/validation.go

Lines changed: 56 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -208,15 +208,17 @@ func (v *Validation) validateCreateLaunchTemplateAuthorization(
208208
nodeClaim *karpv1.NodeClaim,
209209
tags map[string]string,
210210
) (launchTemplate *launchtemplate.LaunchTemplate, result reconcile.Result, err error) {
211-
instanceTypes, err := v.getPrioritizedInstanceTypes(ctx, nodeClass)
211+
nodePools, err := nodepoolutils.ListManaged(ctx, v.kubeClient, v.cloudProvider, nodepoolutils.ForNodeClass(nodeClass))
212212
if err != nil {
213-
return nil, reconcile.Result{}, fmt.Errorf("generating options, %w", err)
213+
return nil, reconcile.Result{}, fmt.Errorf("listing nodepools for nodeclass, %w", err)
214214
}
215-
// pass 1 instance type in EnsureAll to only create 1 launch template
216-
tenancyType, err := v.getTenancyType(ctx, nodeClass)
215+
instanceTypes, err := v.getPrioritizedInstanceTypes(ctx, nodeClass, nodePools)
217216
if err != nil {
218-
return nil, reconcile.Result{}, fmt.Errorf("determining instance tenancy, %w", err)
217+
return nil, reconcile.Result{}, fmt.Errorf("generating options, %w", err)
219218
}
219+
// pass 1 instance type in EnsureAll to only create 1 launch template
220+
tenancyType := getTenancyType(nodePools)
221+
220222
launchTemplates, err := v.launchTemplateProvider.EnsureAll(ctx, nodeClass, nodeClaim, instanceTypes[:1], karpv1.CapacityTypeOnDemand, tags, string(tenancyType))
221223
if err != nil {
222224
if awserrors.IsRateLimitedError(err) || awserrors.IsServerError(err) {
@@ -396,16 +398,20 @@ func getFleetLaunchTemplateConfig(
396398
}
397399
}
398400

399-
func (v *Validation) getPrioritizedInstanceTypes(ctx context.Context, nodeClass *v1.EC2NodeClass) ([]*cloudprovider.InstanceType, error) {
400-
// Select an instance type to use for validation. If NodePools exist for this NodeClass, we'll use an instance type
401-
// selected by one of those NodePools. We should also prioritize an InstanceType which will launch with a non-GPU
402-
// (VariantStandard) AMI, since GPU AMIs may have a larger snapshot size than that supported by the NodeClass'
403-
// blockDeviceMappings.
401+
// getPrioritizedInstanceTypes returns the set of instances which could be launched using this NodeClass based on the
402+
// requirements of linked NodePools. If no NodePools exist for the given NodeClass, this function returns two default
403+
// instance types (one x86_64 and one arm64). If the 2 default instance types are not compatible with the NodeClass,
404+
// this function we'll use an instance type that could be selected with an open NodePool.
405+
func (v *Validation) getPrioritizedInstanceTypes(ctx context.Context, nodeClass *v1.EC2NodeClass, nodePools []*karpv1.NodePool) ([]*cloudprovider.InstanceType, error) {
406+
// We should prioritize an InstanceType which will launch with a non-GPU (VariantStandard) AMI, since GPU
407+
// AMIs may have a larger snapshot size than that supported by the NodeClass' blockDeviceMappings.
404408
// Historical Issue: https://github.com/aws/karpenter-provider-aws/issues/7928
405-
instanceTypes, err := v.getInstanceTypesForNodeClass(ctx, nodeClass)
409+
instanceTypes, err := v.instanceTypeProvider.List(ctx, nodeClass)
406410
if err != nil {
407-
return nil, err
411+
return nil, fmt.Errorf("listing instance types for nodeclass, %w", err)
408412
}
413+
instanceTypes = v.instanceTypeProvider.FilterForNodeClass(instanceTypes, nodeClass)
414+
compatibleInstanceTypes := getNodePoolCompatibleInstanceTypes(instanceTypes, nodePools)
409415

410416
// If there weren't any matching instance types, we should fallback to some defaults. There's an instance type included
411417
// for both x86_64 and arm64 architectures, ensuring that there will be a matching AMI. We also fallback to the default
@@ -414,51 +420,54 @@ func (v *Validation) getPrioritizedInstanceTypes(ctx context.Context, nodeClass
414420
// wouldn't be chosen due to cost in practice. This ensures the behavior matches that on Karpenter v1.3, preventing a
415421
// potential regression for Windows users.
416422
// Tracking issue: https://github.com/aws/karpenter-provider-aws/issues/7985
417-
if len(instanceTypes) == 0 || lo.ContainsBy([]string{
423+
if len(compatibleInstanceTypes) == 0 || lo.ContainsBy([]string{
418424
v1.AMIFamilyWindows2019,
419425
v1.AMIFamilyWindows2022,
420426
v1.AMIFamilyWindows2025,
421427
}, func(family string) bool {
422428
return family == nodeClass.AMIFamily()
423429
}) {
424-
instanceTypes = []*cloudprovider.InstanceType{
425-
{
426-
Name: string(ec2types.InstanceTypeM5Large),
427-
Requirements: scheduling.NewRequirements(append(
428-
lo.Values(amifamily.VariantStandard.Requirements()),
429-
scheduling.NewRequirement(corev1.LabelArchStable, corev1.NodeSelectorOpIn, karpv1.ArchitectureAmd64),
430-
scheduling.NewRequirement(corev1.LabelOSStable, corev1.NodeSelectorOpExists),
431-
scheduling.NewRequirement(corev1.LabelWindowsBuild, corev1.NodeSelectorOpExists),
432-
)...),
433-
},
434-
{
435-
Name: string(ec2types.InstanceTypeM6gLarge),
436-
Requirements: scheduling.NewRequirements(append(
437-
lo.Values(amifamily.VariantStandard.Requirements()),
438-
scheduling.NewRequirement(corev1.LabelArchStable, corev1.NodeSelectorOpIn, karpv1.ArchitectureArm64),
439-
scheduling.NewRequirement(corev1.LabelOSStable, corev1.NodeSelectorOpExists),
440-
scheduling.NewRequirement(corev1.LabelWindowsBuild, corev1.NodeSelectorOpExists),
441-
)...),
442-
},
443-
}
444-
instanceTypes = getAMICompatibleInstanceTypes(instanceTypes, nodeClass)
430+
compatibleInstanceTypes = v.getFallbackInstanceTypes(instanceTypes, nodeClass)
445431
}
446-
447-
return instanceTypes, nil
432+
return getAMICompatibleInstanceTypes(compatibleInstanceTypes, nodeClass), nil
448433
}
449434

450-
// getInstanceTypesForNodeClass returns the set of instances which could be launched using this NodeClass based on the
451-
// requirements of linked NodePools. If no NodePools exist for the given NodeClass, this function returns two default
452-
// instance types (one x86_64 and one arm64).
453-
func (v *Validation) getInstanceTypesForNodeClass(ctx context.Context, nodeClass *v1.EC2NodeClass) ([]*cloudprovider.InstanceType, error) {
454-
instanceTypes, err := v.instanceTypeProvider.List(ctx, nodeClass)
455-
if err != nil {
456-
return nil, fmt.Errorf("listing instance types for nodeclass, %w", err)
435+
func (v *Validation) getFallbackInstanceTypes(instanceTypes []*cloudprovider.InstanceType, nodeClass *v1.EC2NodeClass) []*cloudprovider.InstanceType {
436+
fallbackInstanceTypes := []*cloudprovider.InstanceType{
437+
{
438+
Name: string(ec2types.InstanceTypeM5Large),
439+
Requirements: scheduling.NewRequirements(append(
440+
lo.Values(amifamily.VariantStandard.Requirements()),
441+
scheduling.NewRequirement(corev1.LabelArchStable, corev1.NodeSelectorOpIn, karpv1.ArchitectureAmd64),
442+
scheduling.NewRequirement(corev1.LabelOSStable, corev1.NodeSelectorOpExists),
443+
scheduling.NewRequirement(corev1.LabelWindowsBuild, corev1.NodeSelectorOpExists),
444+
)...),
445+
},
446+
{
447+
Name: string(ec2types.InstanceTypeM6gLarge),
448+
Requirements: scheduling.NewRequirements(append(
449+
lo.Values(amifamily.VariantStandard.Requirements()),
450+
scheduling.NewRequirement(corev1.LabelArchStable, corev1.NodeSelectorOpIn, karpv1.ArchitectureArm64),
451+
scheduling.NewRequirement(corev1.LabelOSStable, corev1.NodeSelectorOpExists),
452+
scheduling.NewRequirement(corev1.LabelWindowsBuild, corev1.NodeSelectorOpExists),
453+
)...),
454+
},
457455
}
458-
nodePools, err := nodepoolutils.ListManaged(ctx, v.kubeClient, v.cloudProvider, nodepoolutils.ForNodeClass(nodeClass))
459-
if err != nil {
460-
return nil, fmt.Errorf("listing nodepools for nodeclass, %w", err)
456+
fallbackInstanceTypes = v.instanceTypeProvider.FilterForNodeClass(fallbackInstanceTypes, nodeClass)
457+
return lo.Ternary(len(fallbackInstanceTypes) == 0, instanceTypes, fallbackInstanceTypes)
458+
}
459+
460+
func getTenancyType(nodePools []*karpv1.NodePool) ec2types.Tenancy {
461+
for _, np := range nodePools {
462+
reqs := scheduling.NewNodeSelectorRequirementsWithMinValues(np.Spec.Template.Spec.Requirements...)
463+
if reqs.Has(v1.LabelInstanceTenancy) && reqs.Get(v1.LabelInstanceTenancy).Has(string(ec2types.TenancyDedicated)) {
464+
return ec2types.TenancyDedicated
465+
}
461466
}
467+
return ec2types.TenancyDefault
468+
}
469+
470+
func getNodePoolCompatibleInstanceTypes(instanceTypes []*cloudprovider.InstanceType, nodePools []*karpv1.NodePool) []*cloudprovider.InstanceType {
462471
var compatibleInstanceTypes []*cloudprovider.InstanceType
463472
names := sets.New[string]()
464473
for _, np := range nodePools {
@@ -477,22 +486,7 @@ func (v *Validation) getInstanceTypesForNodeClass(ctx context.Context, nodeClass
477486
compatibleInstanceTypes = append(compatibleInstanceTypes, it)
478487
}
479488
}
480-
return getAMICompatibleInstanceTypes(compatibleInstanceTypes, nodeClass), nil
481-
}
482-
483-
func (v *Validation) getTenancyType(ctx context.Context, nodeClass *v1.EC2NodeClass) (ec2types.Tenancy, error) {
484-
nodePools, err := nodepoolutils.ListManaged(ctx, v.kubeClient, v.cloudProvider, nodepoolutils.ForNodeClass(nodeClass))
485-
if err != nil {
486-
return "", fmt.Errorf("listing nodepools for nodeclass, %w", err)
487-
}
488-
489-
for _, np := range nodePools {
490-
reqs := scheduling.NewNodeSelectorRequirementsWithMinValues(np.Spec.Template.Spec.Requirements...)
491-
if reqs.Has(v1.LabelInstanceTenancy) && reqs.Get(v1.LabelInstanceTenancy).Has(string(ec2types.TenancyDedicated)) {
492-
return ec2types.TenancyDedicated, nil
493-
}
494-
}
495-
return ec2types.TenancyDefault, nil
489+
return compatibleInstanceTypes
496490
}
497491

498492
func getAMICompatibleInstanceTypes(instanceTypes []*cloudprovider.InstanceType, nodeClass *v1.EC2NodeClass) []*cloudprovider.InstanceType {

pkg/controllers/nodeclass/validation_test.go

Lines changed: 54 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@ package nodeclass_test
1616

1717
import (
1818
"fmt"
19-
"strings"
2019
"time"
2120

2221
"github.com/awslabs/operatorpkg/object"
@@ -272,6 +271,60 @@ var _ = Describe("NodeClass Validation Status Controller", func() {
272271
}, fake.MaxCalls(1))
273272
}, nodeclass.ConditionReasonCreateLaunchTemplateAuthFailed),
274273
)
274+
Context("Windows AMI Validation", func() {
275+
DescribeTable(
276+
"should fallback to static instance types when windows ami is used",
277+
func(family string, terms []v1.AMISelectorTerm, expectedAMIID string) {
278+
// Skip Windows 2025 on versions < 1.35
279+
if family == v1.AMIFamilyWindows2025 && version.MustParseGeneric(awsEnv.VersionProvider.Get(ctx)).Minor() < 35 {
280+
Skip("Windows 2025 requires EKS version 1.35+, current version: " + awsEnv.VersionProvider.Get(ctx))
281+
}
282+
nodeClass.Spec.AMIFamily = lo.ToPtr(family)
283+
nodeClass.Spec.AMISelectorTerms = terms
284+
ExpectApplied(ctx, env.Client, nodeClass)
285+
ExpectObjectReconciled(ctx, env.Client, controller, nodeClass)
286+
runInstancesInput := awsEnv.EC2API.RunInstancesBehavior.CalledWithInput.Pop()
287+
Expect(runInstancesInput.InstanceType).To(Equal(ec2types.InstanceTypeM5Large))
288+
},
289+
Entry("Windows2019 with m5.large", v1.AMIFamilyWindows2019, []v1.AMISelectorTerm{{Alias: "windows2019@latest"}}, "amd64-ami-id"),
290+
Entry("Windows2022 with m5.large", v1.AMIFamilyWindows2022, []v1.AMISelectorTerm{{Alias: "windows2022@latest"}}, "amd64-ami-id"),
291+
Entry("Windows2025 with m6g.large", v1.AMIFamilyWindows2025, []v1.AMISelectorTerm{{Alias: "windows2025@latest"}}, "arm64-ami-id"),
292+
)
293+
})
294+
Context("NodeClass Filtering", func() {
295+
It("should succeed validation using fallback instance types when NodePool requirements are incompatible with NodeClass AMI", func() {
296+
// AL2023 AMI Family is not compatible with a1 instance family
297+
nodeClass.Spec.AMIFamily = lo.ToPtr(v1.AMIFamilyAL2023)
298+
nodeClass.Spec.AMISelectorTerms = []v1.AMISelectorTerm{{Alias: "al2023@latest"}}
299+
300+
nodePool := coretest.NodePool(karpv1.NodePool{Spec: karpv1.NodePoolSpec{Template: karpv1.NodeClaimTemplate{
301+
Spec: karpv1.NodeClaimTemplateSpec{
302+
Requirements: []karpv1.NodeSelectorRequirementWithMinValues{
303+
{
304+
Key: v1.LabelInstanceFamily,
305+
Operator: corev1.NodeSelectorOpIn,
306+
Values: []string{"a1"},
307+
},
308+
},
309+
NodeClassRef: &karpv1.NodeClassReference{
310+
Group: object.GVK(nodeClass).Group,
311+
Kind: object.GVK(nodeClass).Kind,
312+
Name: nodeClass.Name,
313+
},
314+
},
315+
}}})
316+
317+
ExpectApplied(ctx, env.Client, nodePool, nodeClass)
318+
ExpectObjectReconciled(ctx, env.Client, controller, nodeClass)
319+
320+
nodeClass = ExpectExists(ctx, env.Client, nodeClass)
321+
Expect(nodeClass.StatusConditions().Get(v1.ConditionTypeValidationSucceeded).IsTrue()).To(BeTrue())
322+
323+
// Verify a fallback instance type is used for valdiation; not a1
324+
runInstancesInput := awsEnv.EC2API.RunInstancesBehavior.CalledWithInput.Pop()
325+
Expect(string(runInstancesInput.InstanceType)).ToNot(HavePrefix("a1"))
326+
})
327+
})
275328
Context("Instance Type Prioritization Validation", func() {
276329
BeforeEach(func() {
277330
awsEnv.EC2API.DescribeImagesOutput.Set(&ec2.DescribeImagesOutput{
@@ -308,26 +361,6 @@ var _ = Describe("NodeClass Validation Status Controller", func() {
308361
Expect(awsEnv.InstanceTypesProvider.UpdateInstanceTypes(ctx)).To(Succeed())
309362
Expect(awsEnv.InstanceTypesProvider.UpdateInstanceTypeOfferings(ctx)).To(Succeed())
310363
})
311-
DescribeTable(
312-
"should fallback to static instance types when no linked NodePools exist",
313-
func(expectedInstanceType ec2types.InstanceType, expectedAMIID string) {
314-
// Filter out the non-target standard AMI to ensure the right instance is selected, but leave the nvidia AMI to
315-
// test AMI selection.
316-
awsEnv.SSMAPI.Parameters = lo.PickBy(awsEnv.SSMAPI.Parameters, func(_, amiID string) bool {
317-
return amiID == expectedAMIID || strings.Contains(amiID, "nvidia")
318-
})
319-
Expect(len(awsEnv.SSMAPI.Parameters)).To(BeNumerically(">", 1))
320-
321-
ExpectApplied(ctx, env.Client, nodeClass)
322-
ExpectObjectReconciled(ctx, env.Client, controller, nodeClass)
323-
launchTemplateInput := awsEnv.EC2API.CreateLaunchTemplateBehavior.CalledWithInput.Pop()
324-
runInstancesInput := awsEnv.EC2API.RunInstancesBehavior.CalledWithInput.Pop()
325-
Expect(runInstancesInput.InstanceType).To(Equal(expectedInstanceType))
326-
Expect(launchTemplateInput.LaunchTemplateData.ImageId).To(PointTo(Equal(expectedAMIID)))
327-
},
328-
Entry("m5.large", ec2types.InstanceTypeM5Large, "amd64-ami-id"),
329-
Entry("m6g.large", ec2types.InstanceTypeM6gLarge, "arm64-ami-id"),
330-
)
331364
It("should prioritize non-GPU instances", func() {
332365
nodePool := coretest.NodePool(karpv1.NodePool{Spec: karpv1.NodePoolSpec{Template: karpv1.NodeClaimTemplate{
333366
Spec: karpv1.NodeClaimTemplateSpec{
Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
/*
2+
Licensed under the Apache License, Version 2.0 (the "License");
3+
you may not use this file except in compliance with the License.
4+
You may obtain a copy of the License at
5+
6+
http://www.apache.org/licenses/LICENSE-2.0
7+
8+
Unless required by applicable law or agreed to in writing, software
9+
distributed under the License is distributed on an "AS IS" BASIS,
10+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11+
See the License for the specific language governing permissions and
12+
limitations under the License.
13+
*/
14+
15+
package compatibility
16+
17+
import (
18+
"strings"
19+
20+
ec2types "github.com/aws/aws-sdk-go-v2/service/ec2/types"
21+
22+
v1 "github.com/aws/karpenter-provider-aws/pkg/apis/v1"
23+
)
24+
25+
type NodeClass interface {
26+
AMIFamily() string
27+
}
28+
29+
type CompatibleCheck interface {
30+
compatibleCheck(info ec2types.InstanceTypeInfo) bool
31+
}
32+
33+
func IsCompatibleWithNodeClass(info ec2types.InstanceTypeInfo, nodeClass NodeClass) bool {
34+
for _, check := range []CompatibleCheck{
35+
amiFamilyCompatibility(nodeClass.AMIFamily()),
36+
} {
37+
if !check.compatibleCheck(info) {
38+
return false
39+
}
40+
}
41+
return true
42+
}
43+
44+
type amiFamilyCheck struct {
45+
amiFamily string
46+
}
47+
48+
func amiFamilyCompatibility(amiFamily string) CompatibleCheck {
49+
return &amiFamilyCheck{
50+
amiFamily: amiFamily,
51+
}
52+
}
53+
54+
func (c amiFamilyCheck) compatibleCheck(info ec2types.InstanceTypeInfo) bool {
55+
// a1 instance types are not supported with al2023s (https://docs.aws.amazon.com/linux/al2023/ug/system-requirements.html)
56+
if c.amiFamily == v1.AMIFamilyAL2023 && strings.HasPrefix(string(info.InstanceType), "a1.") {
57+
return false
58+
}
59+
return true
60+
}

0 commit comments

Comments
 (0)