Skip to content

Commit 8d2353e

Browse files
authored
Add AWS EC2 Spot functionality to Escalator (#188)
* Add AWS EC2 Spot functionality to Escalator * Fixes for the linter
1 parent 85fd5fd commit 8d2353e

File tree

9 files changed

+514
-38
lines changed

9 files changed

+514
-38
lines changed

cmd/main.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,8 @@ func setupCloudProvider(nodegroups []controller.NodeGroupOptions) cloudprovider.
7575
LaunchTemplateID: n.AWS.LaunchTemplateID,
7676
LaunchTemplateVersion: n.AWS.LaunchTemplateVersion,
7777
FleetInstanceReadyTimeout: n.AWS.FleetInstanceReadyTimeoutDuration(),
78+
Lifecycle: n.AWS.Lifecycle,
79+
InstanceTypeOverrides: n.AWS.InstanceTypeOverrides,
7880
},
7981
})
8082
}

docs/configuration/nodegroup.md

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,8 @@ node_groups:
3030
fleet_instance_ready_timeout: 1m
3131
launch_template_version: "1"
3232
launch_template_id: lt-1a2b3c4d
33+
lifecycle: on-demand
34+
instance_type_overrides: ["t2.large", "t3.large"]
3335
```
3436
3537
## Options
@@ -224,3 +226,19 @@ numeric string. This value can be obtained through the AWS EC2 console on the La
224226
`LatestVersionNumber` or `DefaultVersionNumber` field returned from the
225227
[create-launch-template](https://docs.aws.amazon.com/cli/latest/reference/ec2/create-launch-template.html) CLI command
226228
and AWS API call.
229+
230+
### `aws.lifecycle`
231+
232+
Dependent on Launch Template ID being specified.
233+
234+
This optional value is the lifecycle of the instances that will be launched. The accepted values are strings of either
235+
`on-demand` or `spot` to request On-Demand or Spot instances respectively. If no value is specified this will default
236+
to `on-demand`.
237+
238+
### `aws.instance_type_overrides`
239+
240+
Dependent on Launch Template ID being specified.
241+
242+
An optional list of instance types to override the instance type within the launch template. Providing multiple instance
243+
types here increases the likelihood of a Spot request being successful. If omitted the instance type to request will
244+
be taken from the launch template.

docs/deployment/aws/README.md

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,9 +19,13 @@ Escalator requires the following IAM policy to be able to properly integrate wit
1919
"Effect": "Allow",
2020
"Action": [
2121
"autoscaling:DescribeAutoScalingGroups",
22+
"autoscaling:DescribeAutoScalingInstances",
23+
"autoscaling:DescribeLaunchConfigurations",
24+
"autoscaling:DescribeTags",
2225
"autoscaling:SetDesiredCapacity",
2326
"autoscaling:TerminateInstanceInAutoScalingGroup",
24-
"ec2:DescribeInstances"
27+
"ec2:DescribeInstances",
28+
"ec2:DescribeLaunchTemplateVersions"
2529
],
2630
"Resource": "*"
2731
}
@@ -99,3 +103,5 @@ region.
99103
- Do not use
100104
[Auto Scaling Lifecycle Hooks](https://docs.aws.amazon.com/autoscaling/ec2/userguide/lifecycle-hooks.html) for
101105
terminating of instances as Escalator will handle the termination of instances itself.
106+
- If using launch templates do not use the "network settings" area to configure the security groups. The security groups
107+
should be configured via a network interface.

pkg/cloudprovider/aws/aws.go

Lines changed: 126 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,16 @@ import (
1717
v1 "k8s.io/api/core/v1"
1818
)
1919

20-
// ProviderName identifies this module as aws
21-
const ProviderName = "aws"
20+
const (
21+
// ProviderName identifies this module as aws
22+
ProviderName = "aws"
23+
// LifecycleOnDemand string constant for On-Demand EC2 instances
24+
LifecycleOnDemand = "on-demand"
25+
// LifecycleSpot string constant for Spot EC2 instances
26+
LifecycleSpot = "spot"
27+
// The AttachInstances API only supports adding 20 instances at a time
28+
batchSize = 20
29+
)
2230

2331
func instanceToProviderID(instance *autoscaling.Instance) string {
2432
return fmt.Sprintf("aws:///%s/%s", *instance.AvailabilityZone, *instance.InstanceId)
@@ -236,7 +244,7 @@ func (n *NodeGroup) IncreaseSize(delta int64) error {
236244
return n.setASGDesiredSizeOneShot(delta)
237245
}
238246

239-
log.WithField("asg", n.id).Infof("Scaling with SetDesiredCapacity trategy")
247+
log.WithField("asg", n.id).Infof("Scaling with SetDesiredCapacity strategy")
240248
return n.setASGDesiredSize(n.TargetSize() + delta)
241249

242250
}
@@ -343,36 +351,27 @@ func (n *NodeGroup) setASGDesiredSize(newSize int64) error {
343351
// setASGDesiredSizeOneShot uses the AWS fleet API to acquire all desired
344352
// capacity in one step and then add it to the existing auto-scaling group.
345353
func (n *NodeGroup) setASGDesiredSizeOneShot(addCount int64) error {
346-
fleet, err := n.provider.ec2Service.CreateFleet(&ec2.CreateFleetInput{
347-
Type: awsapi.String("instant"),
348-
TerminateInstancesWithExpiration: awsapi.Bool(false),
349-
OnDemandOptions: &ec2.OnDemandOptionsRequest{
350-
MinTargetCapacity: awsapi.Int64(addCount),
351-
SingleInstanceType: awsapi.Bool(true),
352-
},
353-
TargetCapacitySpecification: &ec2.TargetCapacitySpecificationRequest{
354-
OnDemandTargetCapacity: awsapi.Int64(addCount),
355-
TotalTargetCapacity: awsapi.Int64(addCount),
356-
DefaultTargetCapacityType: awsapi.String("on-demand"),
357-
},
358-
LaunchTemplateConfigs: []*ec2.FleetLaunchTemplateConfigRequest{
359-
{
360-
LaunchTemplateSpecification: &ec2.FleetLaunchTemplateSpecificationRequest{
361-
LaunchTemplateId: awsapi.String(n.config.AWSConfig.LaunchTemplateID),
362-
Version: awsapi.String(n.config.AWSConfig.LaunchTemplateVersion),
363-
},
364-
},
365-
},
366-
})
354+
// Parse the Escalator args into the correct format for a CreateFleet request, then make the request.
355+
fleetInput, err := createFleetInput(*n, addCount)
367356
if err != nil {
357+
log.Error("Failed setup for CreateFleet call.")
368358
return err
369359
}
370360

371-
// This will hold any launch errors for the fleet. In the case of an
372-
// instant fleet with a single instant type this will indicate that the
373-
// entire fleet failed to launch.
374-
for _, lerr := range fleet.Errors {
375-
return errors.New(*lerr.ErrorMessage)
361+
fleet, err := n.provider.ec2Service.CreateFleet(fleetInput)
362+
if err != nil {
363+
log.Errorf("Failed CreateFleet call. CreateFleetInput: %v", fleetInput)
364+
return err
365+
}
366+
367+
// CreateFleet returns an array of errors with the response. Sometimes errors are present even when instances were
368+
// successfully provisioned. In this case, the min target capacity is the size of the full request, so if any
369+
// instances are present this indicates we got them all and can ignore the errors.
370+
if len(fleet.Instances) == 0 && len(fleet.Errors) > 0 {
371+
for _, err := range fleet.Errors {
372+
log.Error(*err.ErrorMessage)
373+
}
374+
return errors.New(*fleet.Errors[0].ErrorMessage)
376375
}
377376

378377
instances := make([]*string, 0)
@@ -402,8 +401,6 @@ InstanceReadyLoop:
402401
}
403402
}
404403

405-
// The AttachInstances API only supports adding 20 instances at a time
406-
batchSize := 20
407404
var batch []*string
408405
for batchSize < len(instances) {
409406
instances, batch = instances[batchSize:], instances[0:batchSize:batchSize]
@@ -413,6 +410,7 @@ InstanceReadyLoop:
413410
InstanceIds: batch,
414411
})
415412
if err != nil {
413+
log.Error("Failed AttachInstances call.")
416414
return err
417415
}
418416
}
@@ -426,7 +424,12 @@ InstanceReadyLoop:
426424

427425
log.WithField("asg", n.id).Debugf("CurrentSize: %v", n.Size())
428426
log.WithField("asg", n.id).Debugf("CurrentTargetSize: %v", n.TargetSize())
429-
return err
427+
if err != nil {
428+
log.Error("Failed AttachInstances call.")
429+
return err
430+
}
431+
432+
return nil
430433
}
431434

432435
func (n *NodeGroup) allInstancesReady(ids []*string) bool {
@@ -453,3 +456,94 @@ func (n *NodeGroup) allInstancesReady(ids []*string) bool {
453456

454457
return ready
455458
}
459+
460+
// createFleetInput will parse Escalator input into the format needed for a CreateFleet request.
461+
func createFleetInput(n NodeGroup, addCount int64) (*ec2.CreateFleetInput, error) {
462+
lifecycle := n.config.AWSConfig.Lifecycle
463+
if lifecycle == "" {
464+
lifecycle = LifecycleOnDemand
465+
}
466+
467+
launchTemplateOverrides, err := createTemplateOverrides(n)
468+
if err != nil {
469+
return nil, err
470+
}
471+
472+
fleetInput := &ec2.CreateFleetInput{
473+
Type: awsapi.String("instant"),
474+
TerminateInstancesWithExpiration: awsapi.Bool(false),
475+
TargetCapacitySpecification: &ec2.TargetCapacitySpecificationRequest{
476+
TotalTargetCapacity: awsapi.Int64(addCount),
477+
DefaultTargetCapacityType: awsapi.String(lifecycle),
478+
},
479+
LaunchTemplateConfigs: []*ec2.FleetLaunchTemplateConfigRequest{
480+
{
481+
LaunchTemplateSpecification: &ec2.FleetLaunchTemplateSpecificationRequest{
482+
LaunchTemplateId: awsapi.String(n.config.AWSConfig.LaunchTemplateID),
483+
Version: awsapi.String(n.config.AWSConfig.LaunchTemplateVersion),
484+
},
485+
Overrides: launchTemplateOverrides,
486+
},
487+
},
488+
}
489+
490+
if lifecycle == LifecycleOnDemand {
491+
fleetInput.OnDemandOptions = &ec2.OnDemandOptionsRequest{
492+
MinTargetCapacity: awsapi.Int64(addCount),
493+
SingleInstanceType: awsapi.Bool(true),
494+
}
495+
} else {
496+
fleetInput.SpotOptions = &ec2.SpotOptionsRequest{
497+
MinTargetCapacity: awsapi.Int64(addCount),
498+
SingleInstanceType: awsapi.Bool(true),
499+
}
500+
}
501+
502+
return fleetInput, nil
503+
}
504+
505+
// createTemplateOverrides will parse the overrides into the FleetLaunchTemplateOverridesRequest format
506+
func createTemplateOverrides(n NodeGroup) ([]*ec2.FleetLaunchTemplateOverridesRequest, error) {
507+
// Get subnetIDs from the ASG
508+
describeASGOutput, err := n.provider.service.DescribeAutoScalingGroups(&autoscaling.DescribeAutoScalingGroupsInput{
509+
AutoScalingGroupNames: []*string{
510+
awsapi.String(n.id),
511+
},
512+
})
513+
if err != nil {
514+
log.Errorf("Failed call to DescribeAutoScalingGroups for ASG %v", n.id)
515+
return nil, err
516+
}
517+
if len(describeASGOutput.AutoScalingGroups) == 0 {
518+
return nil, errors.New("failed to get an ASG from DescribeAutoscalingGroups response")
519+
}
520+
if *describeASGOutput.AutoScalingGroups[0].VPCZoneIdentifier == "" {
521+
return nil, errors.New("failed to get any subnetIDs from DescribeAutoscalingGroups response")
522+
}
523+
vpcZoneIdentifier := describeASGOutput.AutoScalingGroups[0].VPCZoneIdentifier
524+
subnetIDs := strings.Split(*vpcZoneIdentifier, ",")
525+
526+
instanceTypes := n.config.AWSConfig.InstanceTypeOverrides
527+
528+
var launchTemplateOverrides []*ec2.FleetLaunchTemplateOverridesRequest
529+
if len(instanceTypes) > 0 {
530+
for i := range subnetIDs {
531+
for j := range instanceTypes {
532+
overridesRequest := ec2.FleetLaunchTemplateOverridesRequest{
533+
SubnetId: &subnetIDs[i],
534+
InstanceType: &instanceTypes[j],
535+
}
536+
launchTemplateOverrides = append(launchTemplateOverrides, &overridesRequest)
537+
}
538+
}
539+
} else {
540+
for i := range subnetIDs {
541+
overridesRequest := ec2.FleetLaunchTemplateOverridesRequest{
542+
SubnetId: &subnetIDs[i],
543+
}
544+
launchTemplateOverrides = append(launchTemplateOverrides, &overridesRequest)
545+
}
546+
}
547+
548+
return launchTemplateOverrides, nil
549+
}

0 commit comments

Comments
 (0)