Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions pkg/cache/cache.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@ const (
// DiscoveredCapacityCacheTTL is the time to drop discovered resource capacity data per-instance type
// if it is not updated by a node creation event or refreshed during controller reconciliation
DiscoveredCapacityCacheTTL = 60 * 24 * time.Hour
// ValidationTTL is the TTL of the validation cache, i.e. how long before the validation controller re-checks for authorization errors
ValidationTTL = 10 * time.Minute
)

const (
Expand Down
25 changes: 20 additions & 5 deletions pkg/controllers/nodeclass/validation.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ import (
"sigs.k8s.io/karpenter/pkg/cloudprovider"
"sigs.k8s.io/karpenter/pkg/scheduling"

"github.com/aws/aws-sdk-go-v2/aws"
"github.com/aws/aws-sdk-go-v2/service/ec2"
ec2types "github.com/aws/aws-sdk-go-v2/service/ec2/types"
corev1 "k8s.io/api/core/v1"
Expand Down Expand Up @@ -159,7 +160,13 @@ func (v *Validation) validateCreateFleetAuthorization(
) (reason string, requeue bool, err error) {
createFleetInput := instance.GetCreateFleetInput(nodeClass, karpv1.CapacityTypeOnDemand, tags, mockLaunchTemplateConfig())
createFleetInput.DryRun = lo.ToPtr(true)
if _, err := v.ec2api.CreateFleet(ctx, createFleetInput); awserrors.IgnoreDryRunError(err) != nil {
// Adding NopRetryer to avoid aggressive retry when rate limited
if _, err := v.ec2api.CreateFleet(ctx, createFleetInput, func(o *ec2.Options) {
o.Retryer = aws.NopRetryer{}
}); awserrors.IgnoreDryRunError(err) != nil {
if awserrors.IsRateLimitedError(err) {
return "", true, nil
}
if awserrors.IgnoreUnauthorizedOperationError(err) != nil {
// Dry run should only ever return UnauthorizedOperation or DryRunOperation so if we receive any other error
// it would be an unexpected state
Expand All @@ -182,7 +189,13 @@ func (v *Validation) validateCreateLaunchTemplateAuthorization(
}
createLaunchTemplateInput := launchtemplate.GetCreateLaunchTemplateInput(ctx, opts[0], corev1.IPv4Protocol, "")
createLaunchTemplateInput.DryRun = lo.ToPtr(true)
if _, err := v.ec2api.CreateLaunchTemplate(ctx, createLaunchTemplateInput); awserrors.IgnoreDryRunError(err) != nil {
// Adding NopRetryer to avoid aggressive retry when rate limited
if _, err := v.ec2api.CreateLaunchTemplate(ctx, createLaunchTemplateInput, func(o *ec2.Options) {
o.Retryer = aws.NopRetryer{}
}); awserrors.IgnoreDryRunError(err) != nil {
if awserrors.IsRateLimitedError(err) {
return "", true, nil
}
if awserrors.IgnoreUnauthorizedOperationError(err) != nil {
// Dry run should only ever return UnauthorizedOperation or DryRunOperation so if we receive any other error
// it would be an unexpected state
Expand Down Expand Up @@ -230,11 +243,13 @@ func (v *Validation) validateRunInstancesAuthorization(
Tags: runInstancesInput.TagSpecifications[0].Tags,
},
)

if _, err = v.ec2api.RunInstances(ctx, runInstancesInput); awserrors.IgnoreDryRunError(err) != nil {
// Adding NopRetryer to avoid aggressive retry when rate limited
if _, err = v.ec2api.RunInstances(ctx, runInstancesInput, func(o *ec2.Options) {
o.Retryer = aws.NopRetryer{}
}); awserrors.IgnoreDryRunError(err) != nil {
// If we get InstanceProfile NotFound, but we have a resolved instance profile in the status,
// this means there is most likely an eventual consistency issue and we just need to requeue
if awserrors.IsInstanceProfileNotFound(err) {
if awserrors.IsInstanceProfileNotFound(err) || awserrors.IsRateLimitedError(err) {
return "", true, nil
}
if awserrors.IgnoreUnauthorizedOperationError(err) != nil {
Expand Down
18 changes: 18 additions & 0 deletions pkg/errors/errors.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ const (
RunInstancesInvalidParameterValueCode = "InvalidParameterValue"
DryRunOperationErrorCode = "DryRunOperation"
UnauthorizedOperationErrorCode = "UnauthorizedOperation"
RateLimitingErrorCode = "RequestLimitExceeded"
)

var (
Expand Down Expand Up @@ -129,6 +130,23 @@ func IgnoreUnauthorizedOperationError(err error) error {
return err
}

// IsRateLimitedError reports whether err contains a smithy.APIError whose
// error code equals RateLimitingErrorCode. A nil err is never rate limited.
func IsRateLimitedError(err error) bool {
	if err == nil {
		return false
	}
	// Extract the API error (if any) and compare its code in one expression.
	apiErr, found := lo.ErrorsAs[smithy.APIError](err)
	return found && apiErr.ErrorCode() == RateLimitingErrorCode
}

// IgnoreRateLimitedError returns nil when IsRateLimitedError(err) is true;
// otherwise it returns err unchanged.
func IgnoreRateLimitedError(err error) error {
	if !IsRateLimitedError(err) {
		return err
	}
	return nil
}

// IsUnfulfillableCapacity returns true if the Fleet err means capacity is temporarily unavailable for launching. This
// could be due to account limits, insufficient ec2 capacity, etc.
func IsUnfulfillableCapacity(err ec2types.CreateFleetError) bool {
Expand Down
2 changes: 1 addition & 1 deletion pkg/operator/operator.go
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,7 @@ func NewOperator(ctx context.Context, operator *operator.Operator) (context.Cont
}
unavailableOfferingsCache := awscache.NewUnavailableOfferings()
ssmCache := cache.New(awscache.SSMCacheTTL, awscache.DefaultCleanupInterval)
validationCache := cache.New(awscache.DefaultTTL, awscache.DefaultCleanupInterval)
validationCache := cache.New(awscache.ValidationTTL, awscache.DefaultCleanupInterval)

subnetProvider := subnet.NewDefaultProvider(ec2api, cache.New(awscache.DefaultTTL, awscache.DefaultCleanupInterval), cache.New(awscache.AvailableIPAddressTTL, awscache.DefaultCleanupInterval), cache.New(awscache.AssociatePublicIPAddressTTL, awscache.DefaultCleanupInterval))
securityGroupProvider := securitygroup.NewDefaultProvider(ec2api, cache.New(awscache.DefaultTTL, awscache.DefaultCleanupInterval))
Expand Down