Skip to content

Commit 8d64bc6

Browse files
authored
Merge pull request #4092 from cnmcavoy/cnmcavoy/speedy-provision-and-termination
Modify AWSMachine reconciliation behavior to terminate and create instances without blocking
2 parents 64827bf + f5d1d45 commit 8d64bc6

File tree

7 files changed

+40
-94
lines changed

7 files changed

+40
-94
lines changed

controllers/awscluster_controller_test.go

Lines changed: 13 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -275,7 +275,7 @@ func TestAWSClusterReconcilerIntegrationTests(t *testing.T) {
275275
mockedDeleteVPCCallsForNonExistentVPC(m)
276276
mockedDeleteLBCalls(true, ev2, e)
277277
mockedDescribeInstanceCall(m)
278-
mockedDeleteInstanceCalls(m)
278+
mockedDeleteInstanceAndAwaitTerminationCalls(m)
279279
}
280280
expect(ec2Mock.EXPECT(), elbv2Mock.EXPECT(), elbMock.EXPECT())
281281

@@ -347,7 +347,7 @@ func TestAWSClusterReconcilerIntegrationTests(t *testing.T) {
347347
mockedDeleteVPCCalls(m)
348348
mockedDescribeInstanceCall(m)
349349
mockedDeleteLBCalls(true, ev2, e)
350-
mockedDeleteInstanceCalls(m)
350+
mockedDeleteInstanceAndAwaitTerminationCalls(m)
351351
mockedDeleteSGCalls(m)
352352
}
353353
expect(ec2Mock.EXPECT(), elbv2Mock.EXPECT(), elbMock.EXPECT())
@@ -497,19 +497,25 @@ func mockedDescribeInstanceCall(m *mocks.MockEC2APIMockRecorder) {
497497
}, nil)
498498
}
499499

500-
func mockedDeleteInstanceCalls(m *mocks.MockEC2APIMockRecorder) {
500+
func mockedDeleteInstanceAndAwaitTerminationCalls(m *mocks.MockEC2APIMockRecorder) {
501501
m.TerminateInstances(
502502
gomock.Eq(&ec2.TerminateInstancesInput{
503503
InstanceIds: aws.StringSlice([]string{"id-1"}),
504504
}),
505-
).
506-
Return(nil, nil)
505+
).Return(nil, nil)
507506
m.WaitUntilInstanceTerminated(
508507
gomock.Eq(&ec2.DescribeInstancesInput{
509508
InstanceIds: aws.StringSlice([]string{"id-1"}),
510509
}),
511-
).
512-
Return(nil)
510+
).Return(nil)
511+
}
512+
513+
func mockedDeleteInstanceCalls(m *mocks.MockEC2APIMockRecorder) {
514+
m.TerminateInstances(
515+
gomock.Eq(&ec2.TerminateInstancesInput{
516+
InstanceIds: aws.StringSlice([]string{"id-1"}),
517+
}),
518+
).Return(nil, nil)
513519
}
514520

515521
func mockedCreateVPCCalls(m *mocks.MockEC2APIMockRecorder) {

controllers/awsmachine_controller.go

Lines changed: 12 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -20,6 +20,7 @@ import (
2020
"context"
2121
"encoding/json"
2222
"fmt"
23+
"time"
2324

2425
"github.com/aws/aws-sdk-go/aws"
2526
ignTypes "github.com/flatcar/ignition/config/v2_3/types"
@@ -340,8 +341,14 @@ func (r *AWSMachineReconciler) reconcileDelete(machineScope *scope.MachineScope,
340341
// This decision is based on the ec2-instance-lifecycle graph at
341342
// https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ec2-instance-lifecycle.html
342343
switch instance.State {
343-
case infrav1.InstanceStateShuttingDown, infrav1.InstanceStateTerminated:
344+
case infrav1.InstanceStateShuttingDown:
344345
machineScope.Info("EC2 instance is shutting down or already terminated", "instance-id", instance.ID)
346+
// requeue reconciliation until we observe termination (or the instance can no longer be looked up)
347+
return ctrl.Result{RequeueAfter: time.Minute}, nil
348+
case infrav1.InstanceStateTerminated:
349+
machineScope.Info("EC2 instance terminated successfully", "instance-id", instance.ID)
350+
controllerutil.RemoveFinalizer(machineScope.AWSMachine, infrav1.MachineFinalizer)
351+
return ctrl.Result{}, nil
345352
default:
346353
machineScope.Info("Terminating EC2 instance", "instance-id", instance.ID)
347354

@@ -352,7 +359,7 @@ func (r *AWSMachineReconciler) reconcileDelete(machineScope *scope.MachineScope,
352359
return ctrl.Result{}, err
353360
}
354361

355-
if err := ec2Service.TerminateInstanceAndWait(instance.ID); err != nil {
362+
if err := ec2Service.TerminateInstance(instance.ID); err != nil {
356363
machineScope.Error(err, "failed to terminate instance")
357364
conditions.MarkFalse(machineScope.AWSMachine, infrav1.InstanceReadyCondition, "DeletingFailed", clusterv1.ConditionSeverityWarning, err.Error())
358365
r.Recorder.Eventf(machineScope.AWSMachine, corev1.EventTypeWarning, "FailedTerminate", "Failed to terminate instance %q: %v", instance.ID, err)
@@ -391,12 +398,10 @@ func (r *AWSMachineReconciler) reconcileDelete(machineScope *scope.MachineScope,
391398

392399
machineScope.Info("EC2 instance successfully terminated", "instance-id", instance.ID)
393400
r.Recorder.Eventf(machineScope.AWSMachine, corev1.EventTypeNormal, "SuccessfulTerminate", "Terminated instance %q", instance.ID)
394-
}
395401

396-
// Instance is deleted so remove the finalizer.
397-
controllerutil.RemoveFinalizer(machineScope.AWSMachine, infrav1.MachineFinalizer)
398-
399-
return ctrl.Result{}, nil
402+
// requeue reconciliation until we observe termination (or the instance can no longer be looked up)
403+
return ctrl.Result{RequeueAfter: time.Minute}, nil
404+
}
400405
}
401406

402407
// findInstance queries the EC2 apis and retrieves the instance if it exists.

controllers/awsmachine_controller_test.go

Lines changed: 0 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -581,8 +581,6 @@ func mockedCreateInstanceCalls(m *mocks.MockEC2APIMockRecorder) {
581581
},
582582
},
583583
}, nil)
584-
m.WaitUntilInstanceRunningWithContext(gomock.Any(), gomock.Any(), gomock.Any()).
585-
Return(nil)
586584
m.DescribeNetworkInterfaces(gomock.Eq(&ec2.DescribeNetworkInterfacesInput{Filters: []*ec2.Filter{
587585
{
588586
Name: aws.String("attachment.instance-id"),

controllers/awsmachine_controller_unit_test.go

Lines changed: 15 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -391,6 +391,7 @@ func TestAWSMachineReconciler(t *testing.T) {
391391
secretSvc.EXPECT().UserData(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).Return(nil, nil).Times(1)
392392
instance.State = infrav1.InstanceStatePending
393393
_, _ = reconciler.reconcileNormal(context.Background(), ms, cs, cs, cs, cs)
394+
394395
g.Expect(ms.AWSMachine.Status.InstanceState).To(PointTo(Equal(infrav1.InstanceStatePending)))
395396
g.Expect(ms.AWSMachine.Status.Ready).To(Equal(false))
396397
g.Expect(buf.String()).To(ContainSubstring(("EC2 instance state changed")))
@@ -410,6 +411,7 @@ func TestAWSMachineReconciler(t *testing.T) {
410411
secretSvc.EXPECT().UserData(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).Return(nil, nil).Times(1)
411412
instance.State = infrav1.InstanceStateRunning
412413
_, _ = reconciler.reconcileNormal(context.Background(), ms, cs, cs, cs, cs)
414+
413415
g.Expect(ms.AWSMachine.Status.InstanceState).To(PointTo(Equal(infrav1.InstanceStateRunning)))
414416
g.Expect(ms.AWSMachine.Status.Ready).To(Equal(true))
415417
g.Expect(buf.String()).To(ContainSubstring(("EC2 instance state changed")))
@@ -1081,7 +1083,7 @@ func TestAWSMachineReconciler(t *testing.T) {
10811083

10821084
instance.State = infrav1.InstanceStateRunning
10831085
secretSvc.EXPECT().Delete(gomock.Any()).Return(nil).Times(1)
1084-
ec2Svc.EXPECT().TerminateInstanceAndWait(gomock.Any()).Return(nil).AnyTimes()
1086+
ec2Svc.EXPECT().TerminateInstance(gomock.Any()).Return(nil).AnyTimes()
10851087
_, _ = reconciler.reconcileDelete(ms, cs, cs, cs, cs)
10861088
})
10871089

@@ -1094,7 +1096,7 @@ func TestAWSMachineReconciler(t *testing.T) {
10941096

10951097
ms.AWSMachine.Status.FailureReason = capierrors.MachineStatusErrorPtr(capierrors.UpdateMachineError)
10961098
secretSvc.EXPECT().Delete(gomock.Any()).Return(nil).Times(1)
1097-
ec2Svc.EXPECT().TerminateInstanceAndWait(gomock.Any()).Return(nil).AnyTimes()
1099+
ec2Svc.EXPECT().TerminateInstance(gomock.Any()).Return(nil).AnyTimes()
10981100
_, _ = reconciler.reconcileDelete(ms, cs, cs, cs, cs)
10991101
})
11001102
t.Run("should not attempt to delete the secret if InsecureSkipSecretsManager is set on CloudInit", func(t *testing.T) {
@@ -1107,7 +1109,7 @@ func TestAWSMachineReconciler(t *testing.T) {
11071109
ms.AWSMachine.Spec.CloudInit.InsecureSkipSecretsManager = true
11081110

11091111
secretSvc.EXPECT().Delete(gomock.Any()).Return(nil).Times(0)
1110-
ec2Svc.EXPECT().TerminateInstanceAndWait(gomock.Any()).Return(nil).AnyTimes()
1112+
ec2Svc.EXPECT().TerminateInstance(gomock.Any()).Return(nil).AnyTimes()
11111113

11121114
_, _ = reconciler.reconcileDelete(ms, cs, cs, cs, cs)
11131115
})
@@ -1167,7 +1169,7 @@ func TestAWSMachineReconciler(t *testing.T) {
11671169

11681170
instance.State = infrav1.InstanceStateRunning
11691171
secretSvc.EXPECT().Delete(gomock.Any()).Return(nil).Times(1)
1170-
ec2Svc.EXPECT().TerminateInstanceAndWait(gomock.Any()).Return(nil).AnyTimes()
1172+
ec2Svc.EXPECT().TerminateInstance(gomock.Any()).Return(nil).AnyTimes()
11711173
_, _ = reconciler.reconcileDelete(ms, cs, cs, cs, cs)
11721174
})
11731175

@@ -1180,7 +1182,7 @@ func TestAWSMachineReconciler(t *testing.T) {
11801182

11811183
ms.AWSMachine.Status.FailureReason = capierrors.MachineStatusErrorPtr(capierrors.UpdateMachineError)
11821184
secretSvc.EXPECT().Delete(gomock.Any()).Return(nil).Times(1)
1183-
ec2Svc.EXPECT().TerminateInstanceAndWait(gomock.Any()).Return(nil).AnyTimes()
1185+
ec2Svc.EXPECT().TerminateInstance(gomock.Any()).Return(nil).AnyTimes()
11841186
_, _ = reconciler.reconcileDelete(ms, cs, cs, cs, cs)
11851187
})
11861188
})
@@ -1348,7 +1350,7 @@ func TestAWSMachineReconciler(t *testing.T) {
13481350

13491351
instance.State = infrav1.InstanceStateRunning
13501352
objectStoreSvc.EXPECT().Delete(gomock.Any()).Return(nil).Times(1)
1351-
ec2Svc.EXPECT().TerminateInstanceAndWait(gomock.Any()).Return(nil).AnyTimes()
1353+
ec2Svc.EXPECT().TerminateInstance(gomock.Any()).Return(nil).AnyTimes()
13521354

13531355
_, _ = reconciler.reconcileDelete(ms, cs, cs, cs, cs)
13541356
})
@@ -1365,7 +1367,7 @@ func TestAWSMachineReconciler(t *testing.T) {
13651367
ms.AWSMachine.Status.FailureReason = capierrors.MachineStatusErrorPtr(capierrors.UpdateMachineError)
13661368

13671369
objectStoreSvc.EXPECT().Delete(gomock.Any()).Return(nil).Times(1)
1368-
ec2Svc.EXPECT().TerminateInstanceAndWait(gomock.Any()).Return(nil).AnyTimes()
1370+
ec2Svc.EXPECT().TerminateInstance(gomock.Any()).Return(nil).AnyTimes()
13691371

13701372
_, _ = reconciler.reconcileDelete(ms, cs, cs, cs, cs)
13711373
})
@@ -1429,7 +1431,7 @@ func TestAWSMachineReconciler(t *testing.T) {
14291431

14301432
instance.State = infrav1.InstanceStateRunning
14311433
objectStoreSvc.EXPECT().Delete(gomock.Any()).Return(nil).Times(1)
1432-
ec2Svc.EXPECT().TerminateInstanceAndWait(gomock.Any()).Return(nil).AnyTimes()
1434+
ec2Svc.EXPECT().TerminateInstance(gomock.Any()).Return(nil).AnyTimes()
14331435
_, _ = reconciler.reconcileDelete(ms, cs, cs, cs, cs)
14341436
})
14351437

@@ -1444,7 +1446,7 @@ func TestAWSMachineReconciler(t *testing.T) {
14441446
// TODO: This seems to have no effect on the test result.
14451447
ms.AWSMachine.Status.FailureReason = capierrors.MachineStatusErrorPtr(capierrors.UpdateMachineError)
14461448
objectStoreSvc.EXPECT().Delete(gomock.Any()).Return(nil).Times(1)
1447-
ec2Svc.EXPECT().TerminateInstanceAndWait(gomock.Any()).Return(nil).AnyTimes()
1449+
ec2Svc.EXPECT().TerminateInstance(gomock.Any()).Return(nil).AnyTimes()
14481450
_, _ = reconciler.reconcileDelete(ms, cs, cs, cs, cs)
14491451
})
14501452
})
@@ -1534,9 +1536,8 @@ func TestAWSMachineReconciler(t *testing.T) {
15341536
_, err := reconciler.reconcileDelete(ms, cs, cs, cs, cs)
15351537
g.Expect(err).To(BeNil())
15361538
g.Expect(buf.String()).To(ContainSubstring("EC2 instance is shutting down or already terminated"))
1537-
g.Expect(ms.AWSMachine.Finalizers).To(ConsistOf(metav1.FinalizerDeleteDependents))
15381539
})
1539-
t.Run("should ignore instances in terminated down state", func(t *testing.T) {
1540+
t.Run("should ignore instances in terminated state", func(t *testing.T) {
15401541
g := NewWithT(t)
15411542
awsMachine := getAWSMachine()
15421543
setup(t, g, awsMachine)
@@ -1553,7 +1554,7 @@ func TestAWSMachineReconciler(t *testing.T) {
15531554

15541555
_, err := reconciler.reconcileDelete(ms, cs, cs, cs, cs)
15551556
g.Expect(err).To(BeNil())
1556-
g.Expect(buf.String()).To(ContainSubstring("EC2 instance is shutting down or already terminated"))
1557+
g.Expect(buf.String()).To(ContainSubstring("EC2 instance terminated successfully"))
15571558
g.Expect(ms.AWSMachine.Finalizers).To(ConsistOf(metav1.FinalizerDeleteDependents))
15581559
})
15591560
t.Run("instance not shutting down yet", func(t *testing.T) {
@@ -1572,7 +1573,7 @@ func TestAWSMachineReconciler(t *testing.T) {
15721573
getRunningInstance(t, g)
15731574

15741575
expected := errors.New("can't reach AWS to terminate machine")
1575-
ec2Svc.EXPECT().TerminateInstanceAndWait(gomock.Any()).Return(expected)
1576+
ec2Svc.EXPECT().TerminateInstance(gomock.Any()).Return(expected)
15761577

15771578
buf := new(bytes.Buffer)
15781579
klog.SetOutput(buf)
@@ -1585,7 +1586,7 @@ func TestAWSMachineReconciler(t *testing.T) {
15851586
t.Run("when instance can be shut down", func(t *testing.T) {
15861587
terminateInstance := func(t *testing.T, g *WithT) {
15871588
t.Helper()
1588-
ec2Svc.EXPECT().TerminateInstanceAndWait(gomock.Any()).Return(nil)
1589+
ec2Svc.EXPECT().TerminateInstance(gomock.Any()).Return(nil)
15891590
secretSvc.EXPECT().Delete(gomock.Any()).Return(nil).AnyTimes()
15901591
}
15911592

@@ -1663,7 +1664,6 @@ func TestAWSMachineReconciler(t *testing.T) {
16631664

16641665
_, err := reconciler.reconcileDelete(ms, cs, cs, cs, cs)
16651666
g.Expect(err).To(BeNil())
1666-
g.Expect(ms.AWSMachine.Finalizers).To(ConsistOf(metav1.FinalizerDeleteDependents))
16671667
})
16681668

16691669
t.Run("should fail to detach control plane ELB from instance", func(t *testing.T) {

pkg/cloud/services/ec2/bastion_test.go

Lines changed: 0 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -366,8 +366,6 @@ func TestServiceReconcileBastion(t *testing.T) {
366366
},
367367
},
368368
}, nil)
369-
m.WaitUntilInstanceRunningWithContext(gomock.Any(), gomock.Any(), gomock.Any()).
370-
Return(nil)
371369
},
372370
bastionEnabled: true,
373371
expectError: false,

pkg/cloud/services/ec2/instances.go

Lines changed: 0 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -17,15 +17,12 @@ limitations under the License.
1717
package ec2
1818

1919
import (
20-
"context"
2120
"encoding/base64"
2221
"fmt"
2322
"sort"
2423
"strings"
25-
"time"
2624

2725
"github.com/aws/aws-sdk-go/aws"
28-
"github.com/aws/aws-sdk-go/aws/request"
2926
"github.com/aws/aws-sdk-go/service/ec2"
3027
"github.com/pkg/errors"
3128
"k8s.io/utils/pointer"
@@ -34,7 +31,6 @@ import (
3431
"sigs.k8s.io/cluster-api-provider-aws/v2/pkg/cloud/awserrors"
3532
"sigs.k8s.io/cluster-api-provider-aws/v2/pkg/cloud/converters"
3633
"sigs.k8s.io/cluster-api-provider-aws/v2/pkg/cloud/filter"
37-
awslogs "sigs.k8s.io/cluster-api-provider-aws/v2/pkg/cloud/logs"
3834
"sigs.k8s.io/cluster-api-provider-aws/v2/pkg/cloud/scope"
3935
"sigs.k8s.io/cluster-api-provider-aws/v2/pkg/cloud/services/userdata"
4036
"sigs.k8s.io/cluster-api-provider-aws/v2/pkg/record"
@@ -591,19 +587,6 @@ func (s *Service) runInstance(role string, i *infrav1.Instance) (*infrav1.Instan
591587
return nil, errors.Errorf("no instance returned for reservation %v", out.GoString())
592588
}
593589

594-
waitTimeout := 1 * time.Minute
595-
s.scope.Debug("Waiting for instance to be in running state", "instance-id", *out.Instances[0].InstanceId, "timeout", waitTimeout.String())
596-
ctx, cancel := context.WithTimeout(aws.BackgroundContext(), waitTimeout)
597-
defer cancel()
598-
599-
if err := s.EC2Client.WaitUntilInstanceRunningWithContext(
600-
ctx,
601-
&ec2.DescribeInstancesInput{InstanceIds: []*string{out.Instances[0].InstanceId}},
602-
request.WithWaiterLogger(awslogs.NewWrapLogr(s.scope.GetLogger())),
603-
); err != nil {
604-
s.scope.Debug("Could not determine if Machine is running. Machine state might be unavailable until next renconciliation.")
605-
}
606-
607590
return s.SDKToInstance(out.Instances[0])
608591
}
609592

0 commit comments

Comments
 (0)