Skip to content

Commit eae1579

Browse files
JoelSpeedfrobware
authored andcommitted
Ensure DeleteNodes doesn't delete a node twice
1 parent 699c0b8 commit eae1579

File tree

2 files changed

+73
-13
lines changed

2 files changed

+73
-13
lines changed

cluster-autoscaler/cloudprovider/clusterapi/clusterapi_nodegroup.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,10 @@ func (ng *nodegroup) DeleteNodes(nodes []*corev1.Node) error {
8989
return nil
9090
}
9191

92+
if actualNodeGroup == nil {
93+
return fmt.Errorf("no node group found for node %q", node.Spec.ProviderID)
94+
}
95+
9296
if actualNodeGroup.Id() != ng.Id() {
9397
return fmt.Errorf("node %q doesn't belong to node group %q", node.Spec.ProviderID, ng.Id())
9498
}
@@ -117,6 +121,11 @@ func (ng *nodegroup) DeleteNodes(nodes []*corev1.Node) error {
117121

118122
machine = machine.DeepCopy()
119123

124+
if !machine.GetDeletionTimestamp().IsZero() {
125+
// The machine for this node is already being deleted
126+
continue
127+
}
128+
120129
if machine.Annotations == nil {
121130
machine.Annotations = map[string]string{}
122131
}

cluster-autoscaler/cloudprovider/clusterapi/clusterapi_nodegroup_test.go

Lines changed: 64 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ import (
2222
"sort"
2323
"strings"
2424
"testing"
25+
"time"
2526

2627
v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2728
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
@@ -428,22 +429,12 @@ func TestNodeGroupDecreaseTargetSize(t *testing.T) {
428429
switch v := (ng.scalableResource).(type) {
429430
case *machineSetScalableResource:
430431
testConfig.machineSet.Spec.Replicas = int32ptr(*testConfig.machineSet.Spec.Replicas + tc.targetSizeIncrement)
431-
ms, err := ng.machineapiClient.MachineSets(ng.Namespace()).Update(testConfig.machineSet)
432-
if err != nil {
433-
t.Fatalf("unexpected error: %v", err)
434-
}
435-
436-
if err := controller.machineSetInformer.Informer().GetStore().Add(ms); err != nil {
432+
if err := controller.machineSetInformer.Informer().GetStore().Add(newUnstructuredFromMachineSet(testConfig.machineSet)); err != nil {
437433
t.Fatalf("failed to add new machine: %v", err)
438434
}
439435
case *machineDeploymentScalableResource:
440436
testConfig.machineDeployment.Spec.Replicas = int32ptr(*testConfig.machineDeployment.Spec.Replicas + tc.targetSizeIncrement)
441-
md, err := ng.machineapiClient.MachineDeployments(ng.Namespace()).Update(testConfig.machineDeployment)
442-
if err != nil {
443-
t.Fatalf("unexpected error: %v", err)
444-
}
445-
if err := controller.machineDeploymentInformer.Informer().GetStore().Add(md); err != nil {
446-
t.Fatalf("failed to add new machine: %v", err)
437+
if err := controller.machineDeploymentInformer.Informer().GetStore().Add(newUnstructuredFromMachineDeployment(testConfig.machineDeployment)); err != nil {
447438
}
448439
default:
449440
t.Errorf("unexpected type: %T", v)
@@ -815,6 +806,7 @@ func TestNodeGroupMachineSetDeleteNodesWithMismatchedNodes(t *testing.T) {
815806
})
816807
}
817808

809+
func TestNodeGroupDeleteNodesTwice(t *testing.T) {
818810
addDeletionTimestamp := func(t *testing.T, controller *machineController, machine *Machine) error {
819811
// Simulate delete that would have happened if the
820812
// Machine API controllers were running Don't actually
@@ -825,15 +817,50 @@ func TestNodeGroupMachineSetDeleteNodesWithMismatchedNodes(t *testing.T) {
825817
return controller.machineInformer.Informer().GetStore().Update(newUnstructuredFromMachine(machine))
826818
}
827819

820+
test := func(t *testing.T, testConfig *testConfig) {
821+
controller, stop := mustCreateTestController(t, testConfig)
822+
defer stop()
823+
824+
nodegroups, err := controller.nodeGroups()
825+
if err != nil {
826+
t.Fatalf("unexpected error: %v", err)
827+
}
828+
829+
if l := len(nodegroups); l != 1 {
830+
t.Fatalf("expected 1 nodegroup, got %d", l)
831+
}
832+
833+
ng := nodegroups[0]
834+
nodeNames, err := ng.Nodes()
835+
if err != nil {
836+
t.Fatalf("unexpected error: %v", err)
837+
}
838+
839+
if len(nodeNames) != len(testConfig.nodes) {
840+
t.Fatalf("expected len=%v, got len=%v", len(testConfig.nodes), len(nodeNames))
841+
}
842+
843+
sort.SliceStable(nodeNames, func(i, j int) bool {
844+
return nodeNames[i].Id < nodeNames[j].Id
845+
})
846+
847+
for i := 0; i < len(nodeNames); i++ {
848+
if nodeNames[i].Id != testConfig.nodes[i].Spec.ProviderID {
849+
t.Fatalf("expected %q, got %q", testConfig.nodes[i].Spec.ProviderID, nodeNames[i].Id)
850+
}
851+
}
852+
828853
// Assert that we have no DeletionTimestamp
829854
for i := 7; i < len(testConfig.machines); i++ {
830855
if !testConfig.machines[i].ObjectMeta.DeletionTimestamp.IsZero() {
831856
t.Fatalf("unexpected DeletionTimestamp")
832857
}
833858
}
859+
834860
if err := ng.DeleteNodes(testConfig.nodes[7:]); err != nil {
835861
t.Fatalf("unexpected error: %v", err)
836862
}
863+
837864
for i := 7; i < len(testConfig.machines); i++ {
838865
if err := addDeletionTimestamp(t, controller, testConfig.machines[i]); err != nil {
839866
t.Fatalf("unexpected err: %v", err)
@@ -842,6 +869,7 @@ func TestNodeGroupMachineSetDeleteNodesWithMismatchedNodes(t *testing.T) {
842869
t.Fatalf("expected a DeletionTimestamp")
843870
}
844871
}
872+
845873
// TODO(frobware) We have a flaky test here because we
846874
// just called Delete and Update and the next call to
847875
// controller.nodeGroups() will sometimes get stale
@@ -855,7 +883,7 @@ func TestNodeGroupMachineSetDeleteNodesWithMismatchedNodes(t *testing.T) {
855883
//
856884
// Running this test with a 500ms duration I see:
857885
//
858-
// $ ./stress ./clusterapi.test -test.run TestNodeGroupDeleteNodesTwice -test.count 5 | ts | ts -i
886+
// $ ./stress ./openshiftmachineapi.test -test.run TestNodeGroupDeleteNodesTwice -test.count 5 | ts | ts -i
859887
// 00:00:05 Feb 27 14:29:36 0 runs so far, 0 failures
860888
// 00:00:05 Feb 27 14:29:41 8 runs so far, 0 failures
861889
// 00:00:05 Feb 27 14:29:46 16 runs so far, 0 failures
@@ -876,6 +904,7 @@ func TestNodeGroupMachineSetDeleteNodesWithMismatchedNodes(t *testing.T) {
876904
if err != nil {
877905
t.Fatalf("unexpected error: %v", err)
878906
}
907+
879908
ng = nodegroups[0]
880909

881910
// Attempt to delete the nodes again which verifies
@@ -889,7 +918,29 @@ func TestNodeGroupMachineSetDeleteNodesWithMismatchedNodes(t *testing.T) {
889918
if err != nil {
890919
t.Fatalf("unexpected error: %v", err)
891920
}
921+
892922
expectedSize := len(testConfig.machines) - len(testConfig.machines[7:])
893923
if actualSize != expectedSize {
894924
t.Fatalf("expected %d nodes, got %d", expectedSize, actualSize)
895925
}
926+
}
927+
928+
// Note: 10 is an upper bound for the number of nodes/replicas
929+
// Going beyond 10 will break the sorting that happens in the
930+
// test() function because sort.Strings() will not do natural
931+
// sorting and the expected semantics in test() will fail.
932+
933+
t.Run("MachineSet", func(t *testing.T) {
934+
test(t, createMachineSetTestConfig(testNamespace, 10, map[string]string{
935+
nodeGroupMinSizeAnnotationKey: "1",
936+
nodeGroupMaxSizeAnnotationKey: "10",
937+
}))
938+
})
939+
940+
t.Run("MachineDeployment", func(t *testing.T) {
941+
test(t, createMachineDeploymentTestConfig(testNamespace, 10, map[string]string{
942+
nodeGroupMinSizeAnnotationKey: "1",
943+
nodeGroupMaxSizeAnnotationKey: "10",
944+
}))
945+
})
946+
}

0 commit comments

Comments
 (0)