From 0a70a04872ddd4549fbe8111252c1042a6c3a688 Mon Sep 17 00:00:00 2001 From: Prucek Date: Thu, 18 Jul 2024 14:23:05 +0200 Subject: [PATCH 1/3] api: add WaitingTimeout in SpotMarketOptions --- api/v1beta1/types.go | 5 +++++ api/v1beta1/zz_generated.conversion.go | 2 ++ api/v1beta2/types.go | 7 +++++++ ...ane.cluster.x-k8s.io_awsmanagedcontrolplanes.yaml | 12 ++++++++++++ .../infrastructure.cluster.x-k8s.io_awsclusters.yaml | 12 ++++++++++++ ...rastructure.cluster.x-k8s.io_awsmachinepools.yaml | 12 ++++++++++++ .../infrastructure.cluster.x-k8s.io_awsmachines.yaml | 12 ++++++++++++ ...ructure.cluster.x-k8s.io_awsmachinetemplates.yaml | 12 ++++++++++++ ...ture.cluster.x-k8s.io_awsmanagedmachinepools.yaml | 12 ++++++++++++ 9 files changed, 86 insertions(+) diff --git a/api/v1beta1/types.go b/api/v1beta1/types.go index fe6510380b..f4253d4c3c 100644 --- a/api/v1beta1/types.go +++ b/api/v1beta1/types.go @@ -17,6 +17,8 @@ limitations under the License. package v1beta1 import ( + "time" + "k8s.io/apimachinery/pkg/util/sets" clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" @@ -280,6 +282,9 @@ type SpotMarketOptions struct { // +optional // +kubebuilder:validation:pattern="^[0-9]+(\.[0-9]+)?$" MaxPrice *string `json:"maxPrice,omitempty"` + // WaitingTimeout defines the maximum time the user is willing to wait for acquiring a Spot VM instance, after that it will fallback to a regular on-demand instance. + // +optional + WaitingTimeout time.Duration `json:"waitingTimeout,omitempty"` } // EKSAMILookupType specifies which AWS AMI to use for a AWSMachine and AWSMachinePool. diff --git a/api/v1beta1/zz_generated.conversion.go b/api/v1beta1/zz_generated.conversion.go index 2d1641f424..3ff7b69bbb 100644 --- a/api/v1beta1/zz_generated.conversion.go +++ b/api/v1beta1/zz_generated.conversion.go @@ -2222,6 +2222,7 @@ func Convert_v1beta2_SecurityGroup_To_v1beta1_SecurityGroup(in *v1beta2.Security func autoConvert_v1beta1_SpotMarketOptions_To_v1beta2_SpotMarketOptions(in *SpotMarketOptions, out *v1beta2.SpotMarketOptions, s conversion.Scope) error { out.MaxPrice = (*string)(unsafe.Pointer(in.MaxPrice)) + out.WaitingTimeout = time.Duration(in.WaitingTimeout) return nil } @@ -2232,6 +2233,7 @@ func Convert_v1beta1_SpotMarketOptions_To_v1beta2_SpotMarketOptions(in *SpotMark func autoConvert_v1beta2_SpotMarketOptions_To_v1beta1_SpotMarketOptions(in *v1beta2.SpotMarketOptions, out *SpotMarketOptions, s conversion.Scope) error { out.MaxPrice = (*string)(unsafe.Pointer(in.MaxPrice)) + out.WaitingTimeout = time.Duration(in.WaitingTimeout) return nil } diff --git a/api/v1beta2/types.go b/api/v1beta2/types.go index 978d5310f2..cb55d1aea9 100644 --- a/api/v1beta2/types.go +++ b/api/v1beta2/types.go @@ -17,7 +17,11 @@ limitations under the License. package v1beta2 import ( +<<<<<<< HEAD "strings" +======= + "time" +>>>>>>> adad1bff1 (api: add WaitingTimeout in SpotMarketOptions) "k8s.io/apimachinery/pkg/util/sets" @@ -426,6 +430,9 @@ type SpotMarketOptions struct { // +optional // +kubebuilder:validation:pattern="^[0-9]+(\.[0-9]+)?$" MaxPrice *string `json:"maxPrice,omitempty"` + // WaitingTimeout defines the maximum time the user is willing to wait for acquiring a Spot VM instance, after that it will fallback to a regular on-demand instance. + // +optional + WaitingTimeout time.Duration `json:"waitingTimeout,omitempty"` } // EKSAMILookupType specifies which AWS AMI to use for a AWSMachine and AWSMachinePool. diff --git a/config/crd/bases/controlplane.cluster.x-k8s.io_awsmanagedcontrolplanes.yaml b/config/crd/bases/controlplane.cluster.x-k8s.io_awsmanagedcontrolplanes.yaml index af854b225e..7a8a4133fd 100644 --- a/config/crd/bases/controlplane.cluster.x-k8s.io_awsmanagedcontrolplanes.yaml +++ b/config/crd/bases/controlplane.cluster.x-k8s.io_awsmanagedcontrolplanes.yaml @@ -1378,6 +1378,12 @@ spec: description: MaxPrice defines the maximum price the user is willing to pay for Spot VM instances type: string + waitingTimeout: + description: WaitingTimeout defines the maximum time the user + is willing to wait for acquiring a Spot VM instance, after + that it will fallback to a regular on-demand instance. + format: int64 + type: integer type: object sshKeyName: description: The name of the SSH key pair. @@ -3439,6 +3445,12 @@ spec: description: MaxPrice defines the maximum price the user is willing to pay for Spot VM instances type: string + waitingTimeout: + description: WaitingTimeout defines the maximum time the user + is willing to wait for acquiring a Spot VM instance, after + that it will fallback to a regular on-demand instance. + format: int64 + type: integer type: object sshKeyName: description: The name of the SSH key pair. diff --git a/config/crd/bases/infrastructure.cluster.x-k8s.io_awsclusters.yaml b/config/crd/bases/infrastructure.cluster.x-k8s.io_awsclusters.yaml index 7d6cc0a025..c8456609b2 100644 --- a/config/crd/bases/infrastructure.cluster.x-k8s.io_awsclusters.yaml +++ b/config/crd/bases/infrastructure.cluster.x-k8s.io_awsclusters.yaml @@ -573,6 +573,12 @@ spec: description: MaxPrice defines the maximum price the user is willing to pay for Spot VM instances type: string + waitingTimeout: + description: WaitingTimeout defines the maximum time the user + is willing to wait for acquiring a Spot VM instance, after + that it will fallback to a regular on-demand instance. + format: int64 + type: integer type: object sshKeyName: description: The name of the SSH key pair. @@ -2353,6 +2359,12 @@ spec: description: MaxPrice defines the maximum price the user is willing to pay for Spot VM instances type: string + waitingTimeout: + description: WaitingTimeout defines the maximum time the user + is willing to wait for acquiring a Spot VM instance, after + that it will fallback to a regular on-demand instance. + format: int64 + type: integer type: object sshKeyName: description: The name of the SSH key pair. diff --git a/config/crd/bases/infrastructure.cluster.x-k8s.io_awsmachinepools.yaml b/config/crd/bases/infrastructure.cluster.x-k8s.io_awsmachinepools.yaml index e70f544535..2418582e78 100644 --- a/config/crd/bases/infrastructure.cluster.x-k8s.io_awsmachinepools.yaml +++ b/config/crd/bases/infrastructure.cluster.x-k8s.io_awsmachinepools.yaml @@ -221,6 +221,12 @@ spec: description: MaxPrice defines the maximum price the user is willing to pay for Spot VM instances type: string + waitingTimeout: + description: WaitingTimeout defines the maximum time the user + is willing to wait for acquiring a Spot VM instance, after + that it will fallback to a regular on-demand instance. + format: int64 + type: integer type: object sshKeyName: description: |- @@ -851,6 +857,12 @@ spec: description: MaxPrice defines the maximum price the user is willing to pay for Spot VM instances type: string + waitingTimeout: + description: WaitingTimeout defines the maximum time the user + is willing to wait for acquiring a Spot VM instance, after + that it will fallback to a regular on-demand instance. + format: int64 + type: integer type: object sshKeyName: description: |- diff --git a/config/crd/bases/infrastructure.cluster.x-k8s.io_awsmachines.yaml b/config/crd/bases/infrastructure.cluster.x-k8s.io_awsmachines.yaml index c02466fa59..1db8fdf81f 100644 --- a/config/crd/bases/infrastructure.cluster.x-k8s.io_awsmachines.yaml +++ b/config/crd/bases/infrastructure.cluster.x-k8s.io_awsmachines.yaml @@ -330,6 +330,12 @@ spec: description: MaxPrice defines the maximum price the user is willing to pay for Spot VM instances type: string + waitingTimeout: + description: WaitingTimeout defines the maximum time the user + is willing to wait for acquiring a Spot VM instance, after that + it will fallback to a regular on-demand instance. + format: int64 + type: integer type: object sshKeyName: description: SSHKeyName is the name of the ssh key to attach to the @@ -1047,6 +1053,12 @@ spec: description: MaxPrice defines the maximum price the user is willing to pay for Spot VM instances type: string + waitingTimeout: + description: WaitingTimeout defines the maximum time the user + is willing to wait for acquiring a Spot VM instance, after that + it will fallback to a regular on-demand instance. + format: int64 + type: integer type: object sshKeyName: description: SSHKeyName is the name of the ssh key to attach to the diff --git a/config/crd/bases/infrastructure.cluster.x-k8s.io_awsmachinetemplates.yaml b/config/crd/bases/infrastructure.cluster.x-k8s.io_awsmachinetemplates.yaml index 501a837555..7f058e663d 100644 --- a/config/crd/bases/infrastructure.cluster.x-k8s.io_awsmachinetemplates.yaml +++ b/config/crd/bases/infrastructure.cluster.x-k8s.io_awsmachinetemplates.yaml @@ -345,6 +345,12 @@ spec: description: MaxPrice defines the maximum price the user is willing to pay for Spot VM instances type: string + waitingTimeout: + description: WaitingTimeout defines the maximum time the + user is willing to wait for acquiring a Spot VM instance, + after that it will fallback to a regular on-demand instance. + format: int64 + type: integer type: object sshKeyName: description: SSHKeyName is the name of the ssh key to attach @@ -981,6 +987,12 @@ spec: description: MaxPrice defines the maximum price the user is willing to pay for Spot VM instances type: string + waitingTimeout: + description: WaitingTimeout defines the maximum time the + user is willing to wait for acquiring a Spot VM instance, + after that it will fallback to a regular on-demand instance. + format: int64 + type: integer type: object sshKeyName: description: SSHKeyName is the name of the ssh key to attach diff --git a/config/crd/bases/infrastructure.cluster.x-k8s.io_awsmanagedmachinepools.yaml b/config/crd/bases/infrastructure.cluster.x-k8s.io_awsmanagedmachinepools.yaml index 008bfd9d2e..8c8eaa3812 100644 --- a/config/crd/bases/infrastructure.cluster.x-k8s.io_awsmanagedmachinepools.yaml +++ b/config/crd/bases/infrastructure.cluster.x-k8s.io_awsmanagedmachinepools.yaml @@ -230,6 +230,12 @@ spec: description: MaxPrice defines the maximum price the user is willing to pay for Spot VM instances type: string + waitingTimeout: + description: WaitingTimeout defines the maximum time the user + is willing to wait for acquiring a Spot VM instance, after + that it will fallback to a regular on-demand instance. + format: int64 + type: integer type: object sshKeyName: description: |- @@ -847,6 +853,12 @@ spec: description: MaxPrice defines the maximum price the user is willing to pay for Spot VM instances type: string + waitingTimeout: + description: WaitingTimeout defines the maximum time the user + is willing to wait for acquiring a Spot VM instance, after + that it will fallback to a regular on-demand instance. + format: int64 + type: integer type: object sshKeyName: description: |- From 56e58ed8896f17bae6d6817e3bfc422280a9f258 Mon Sep 17 00:00:00 2001 From: Prucek Date: Thu, 18 Jul 2024 14:24:01 +0200 Subject: [PATCH 2/3] ec2: implement spot waiting timeout logic --- api/v1beta2/types.go | 3 - pkg/cloud/services/ec2/instances.go | 26 +++- pkg/cloud/services/ec2/instances_test.go | 145 +++++++++++++++++++++++ 3 files changed, 168 insertions(+), 6 deletions(-) diff --git a/api/v1beta2/types.go b/api/v1beta2/types.go index cb55d1aea9..216fecd80a 100644 --- a/api/v1beta2/types.go +++ b/api/v1beta2/types.go @@ -17,11 +17,8 @@ limitations under the License. package v1beta2 import ( -<<<<<<< HEAD "strings" -======= "time" ->>>>>>> adad1bff1 (api: add WaitingTimeout in SpotMarketOptions) "k8s.io/apimachinery/pkg/util/sets" diff --git a/pkg/cloud/services/ec2/instances.go b/pkg/cloud/services/ec2/instances.go index 0742b0c589..790715925c 100644 --- a/pkg/cloud/services/ec2/instances.go +++ b/pkg/cloud/services/ec2/instances.go @@ -27,6 +27,7 @@ import ( "github.com/aws/aws-sdk-go/service/ec2" "github.com/aws/aws-sdk-go/service/ec2/ec2iface" "github.com/pkg/errors" + "k8s.io/apimachinery/pkg/util/wait" "k8s.io/utils/ptr" infrav1 "sigs.k8s.io/cluster-api-provider-aws/v2/api/v1beta2" @@ -663,9 +664,28 @@ func (s *Service) runInstance(role string, i *infrav1.Instance) (*infrav1.Instan } } - out, err := s.EC2Client.RunInstancesWithContext(context.TODO(), input) - if err != nil { - return nil, errors.Wrap(err, "failed to run instance") + var out *ec2.Reservation + var err error + if i.SpotMarketOptions != nil && i.SpotMarketOptions.WaitingTimeout > 0 { + pollError := wait.PollUntilContextCancel(context.TODO(), i.SpotMarketOptions.WaitingTimeout, true, func(ctx context.Context) (bool, error) { + out, err = s.EC2Client.RunInstancesWithContext(ctx, input) + if err != nil { + return false, errors.Wrap(err, "failed to run instance") + } + return true, nil + }) + if pollError != nil { + input.InstanceMarketOptions = nil + out, err = s.EC2Client.RunInstancesWithContext(context.TODO(), input) + if err != nil { + return nil, errors.Wrap(err, "failed to run instance") + } + } + } else { + out, err = s.EC2Client.RunInstancesWithContext(context.TODO(), input) + if err != nil { + return nil, errors.Wrap(err, "failed to run instance") + } } if len(out.Instances) == 0 { diff --git a/pkg/cloud/services/ec2/instances_test.go b/pkg/cloud/services/ec2/instances_test.go index 51306131a2..3546abfdef 100644 --- a/pkg/cloud/services/ec2/instances_test.go +++ b/pkg/cloud/services/ec2/instances_test.go @@ -21,6 +21,7 @@ import ( "encoding/base64" "strings" "testing" + "time" "github.com/aws/aws-sdk-go/aws" "github.com/aws/aws-sdk-go/aws/awserr" @@ -421,6 +422,137 @@ func TestCreateInstance(t *testing.T) { } }, }, + { + // TODO: (fixme) + // This test is not working as expected. It does not create a spot instance, + // but instead waits for 10s and creates the regular instance. + name: "with spot instance", + machine: &clusterv1.Machine{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{"set": "node"}, + }, + Spec: clusterv1.MachineSpec{ + Bootstrap: clusterv1.Bootstrap{ + DataSecretName: ptr.To("bootstrap-data"), + }, + }, + }, + machineConfig: &infrav1.AWSMachineSpec{ + AMI: infrav1.AMIReference{ + ID: aws.String("abc"), + }, + InstanceType: "m5.large", + SpotMarketOptions: &infrav1.SpotMarketOptions{ + MaxPrice: ptr.To(""), + WaitingTimeout: 10 * time.Second, + }, + }, + awsCluster: &infrav1.AWSCluster{ + ObjectMeta: metav1.ObjectMeta{Name: "test"}, + Spec: infrav1.AWSClusterSpec{ + NetworkSpec: infrav1.NetworkSpec{ + Subnets: infrav1.Subnets{ + infrav1.SubnetSpec{ + ID: "subnet-1", + IsPublic: false, + }, + infrav1.SubnetSpec{ + IsPublic: false, + }, + }, + VPC: infrav1.VPCSpec{ + ID: "vpc-test", + }, + }, + }, + Status: infrav1.AWSClusterStatus{ + Network: infrav1.NetworkStatus{ + SecurityGroups: map[infrav1.SecurityGroupRole]infrav1.SecurityGroup{ + infrav1.SecurityGroupControlPlane: { + ID: "1", + }, + infrav1.SecurityGroupNode: { + ID: "2", + }, + infrav1.SecurityGroupLB: { + ID: "3", + }, + }, + APIServerELB: infrav1.LoadBalancer{ + DNSName: "test-apiserver.us-east-1.aws", + }, + }, + }, + }, + expect: func(m *mocks.MockEC2APIMockRecorder) { + m. + DescribeInstanceTypesWithContext(context.TODO(), gomock.Eq(&ec2.DescribeInstanceTypesInput{ + InstanceTypes: []*string{ + aws.String("m5.large"), + }, + })). + Return(&ec2.DescribeInstanceTypesOutput{ + InstanceTypes: []*ec2.InstanceTypeInfo{ + { + ProcessorInfo: &ec2.ProcessorInfo{ + SupportedArchitectures: []*string{ + aws.String("x86_64"), + }, + }, + SupportedUsageClasses: []*string{ + aws.String("spot"), + }, + }, + }, + }, nil) + m. + RunInstancesWithContext(context.TODO(), gomock.Any()). + Return(&ec2.Reservation{ + Instances: []*ec2.Instance{ + { + State: &ec2.InstanceState{ + Name: aws.String(ec2.InstanceStateNamePending), + }, + IamInstanceProfile: &ec2.IamInstanceProfile{ + Arn: aws.String("arn:aws:iam::123456789012:instance-profile/foo"), + }, + InstanceId: aws.String("two"), + InstanceType: aws.String("m5.large"), + InstanceLifecycle: aws.String("spot"), + SubnetId: aws.String("subnet-1"), + ImageId: aws.String("ami-1"), + RootDeviceName: aws.String("device-1"), + BlockDeviceMappings: []*ec2.InstanceBlockDeviceMapping{ + { + DeviceName: aws.String("device-1"), + Ebs: &ec2.EbsInstanceBlockDevice{ + VolumeId: aws.String("volume-1"), + }, + }, + }, + Placement: &ec2.Placement{ + AvailabilityZone: &az, + }, + SpotInstanceRequestId: aws.String("spot-1"), + }, + }, + }, nil) + m. + DescribeNetworkInterfacesWithContext(context.TODO(), gomock.Any()). + Return(&ec2.DescribeNetworkInterfacesOutput{ + NetworkInterfaces: []*ec2.NetworkInterface{}, + NextToken: nil, + }, nil) + }, + check: func(instance *infrav1.Instance, err error) { + if instance.SpotMarketOptions == nil { + t.Logf("SpotMarketOptions is nil, but should not be") + } + if err != nil { + t.Fatalf("did not expect error: %v", err) + } + }, + }, { name: "with availability zone", machine: &clusterv1.Machine{ @@ -5313,6 +5445,19 @@ func TestGetInstanceMarketOptionsRequest(t *testing.T) { }, }, }, + { + name: "with WaitingTimeout specified", + spotMarketOptions: &infrav1.SpotMarketOptions{ + WaitingTimeout: 10 * time.Minute, + }, + expectedRequest: &ec2.InstanceMarketOptionsRequest{ + MarketType: aws.String(ec2.MarketTypeSpot), + SpotOptions: &ec2.SpotMarketOptions{ + InstanceInterruptionBehavior: aws.String(ec2.InstanceInterruptionBehaviorTerminate), + SpotInstanceType: aws.String(ec2.SpotInstanceTypeOneTime), + }, + }, + }, } for _, tc := range testCases { From 433ce71f39056f369a27fd1e2f5054b12a0a14f6 Mon Sep 17 00:00:00 2001 From: Prucek Date: Thu, 18 Jul 2024 14:25:11 +0200 Subject: [PATCH 3/3] docs: update spot with WaitingTimeout --- docs/book/src/topics/spot-instances.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/docs/book/src/topics/spot-instances.md b/docs/book/src/topics/spot-instances.md index 5030dbd7ad..614493660b 100644 --- a/docs/book/src/topics/spot-instances.md +++ b/docs/book/src/topics/spot-instances.md @@ -25,6 +25,7 @@ spec: instanceType: ${AWS_NODE_MACHINE_TYPE} spotMarketOptions: maxPrice: "" + waitingTimeout: "" sshKeyName: ${AWS_SSH_KEY_NAME} ``` Users may also add a `maxPrice` to the options to limit the maximum spend for the instance. It is however, recommended not to set a maxPrice as AWS will cap your spending at the on-demand price if this field is left empty, and you will experience fewer interruptions. @@ -35,6 +36,14 @@ spec: maxPrice: 0.02 # Price in USD per hour (up to 5 decimal places) ``` +Users may also add a `waitingTimeout` to the options to limit waiting time for the Spot instance. After exceeding this timeout a regular ec2 instance will be acquired. +```yaml +spec: + template: + spotMarketOptions: + waitingTimeout: "10m" # Time in minutes +``` + ## Using Spot Instances with AWSManagedMachinePool To use spot instance in EKS managed node groups for a EKS cluster, set `capacityType` to `spot` in `AWSManagedMachinePool`. ```yaml