Skip to content

Commit ee7cd25

Browse files
committed
Azure: fix node removal race condition on VMSS deletion
When a VMSS is being deleted, instances are removed first. The VMSS itself will disappear once empty. That delay is generally enough for kube-controller-manager to delete the corresponding k8s nodes, but might not when busy or throttled (for instance). If kubernetes nodes remains after their backing VMSS were removed, Azure cloud-provider will fail listing that VMSS VMs, and downstream callers (ie. `InstanceExistsByProviderID`) won't account those errors for a missing instance. The nodes will remain (still considered as "existing"), and controller-manager will indefinitely retry to VMSS VMs list it, draining API calls quotas, potentially causing throttling. In practice a missing scale set implies instances attributed to that VMSS don't exists either: `InstanceExistsByProviderID` (part of the general cloud provider interface) should return false in that case.
1 parent 0d1ac16 commit ee7cd25

File tree

3 files changed

+66
-0
lines changed

3 files changed

+66
-0
lines changed

staging/src/k8s.io/legacy-cloud-providers/azure/azure_instances_test.go

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,8 @@ import (
3838
"k8s.io/legacy-cloud-providers/azure/clients/interfaceclient/mockinterfaceclient"
3939
"k8s.io/legacy-cloud-providers/azure/clients/publicipclient/mockpublicipclient"
4040
"k8s.io/legacy-cloud-providers/azure/clients/vmclient/mockvmclient"
41+
"k8s.io/legacy-cloud-providers/azure/clients/vmssclient/mockvmssclient"
42+
"k8s.io/legacy-cloud-providers/azure/clients/vmssvmclient/mockvmssvmclient"
4143
"k8s.io/legacy-cloud-providers/azure/retry"
4244
)
4345

@@ -633,6 +635,58 @@ func TestInstanceExistsByProviderID(t *testing.T) {
633635
assert.Equal(t, test.expectedErrMsg, err, test.name)
634636
assert.Equal(t, test.expected, exist, test.name)
635637
}
638+
639+
vmssTestCases := []struct {
640+
name string
641+
providerID string
642+
scaleSet string
643+
vmList []string
644+
expected bool
645+
rerr *retry.Error
646+
}{
647+
{
648+
name: "InstanceExistsByProviderID should return true if VMSS and VM exist",
649+
providerID: "azure:///subscriptions/script/resourceGroups/rg/providers/Microsoft.Compute/virtualMachineScaleSets/vmssee6c2/virtualMachines/0",
650+
scaleSet: "vmssee6c2",
651+
vmList: []string{"vmssee6c2000000"},
652+
expected: true,
653+
},
654+
{
655+
name: "InstanceExistsByProviderID should return false if VMSS exist but VM doesn't",
656+
providerID: "azure:///subscriptions/script/resourceGroups/rg/providers/Microsoft.Compute/virtualMachineScaleSets/vmssee6c2/virtualMachines/0",
657+
scaleSet: "vmssee6c2",
658+
expected: false,
659+
},
660+
{
661+
name: "InstanceExistsByProviderID should return false if VMSS doesn't exist",
662+
providerID: "azure:///subscriptions/script/resourceGroups/rg/providers/Microsoft.Compute/virtualMachineScaleSets/missing-vmss/virtualMachines/0",
663+
rerr: &retry.Error{HTTPStatusCode: 404},
664+
expected: false,
665+
},
666+
}
667+
668+
for _, test := range vmssTestCases {
669+
ss, err := newTestScaleSet(ctrl)
670+
assert.NoError(t, err, test.name)
671+
cloud.VMSet = ss
672+
673+
mockVMSSClient := mockvmssclient.NewMockInterface(ctrl)
674+
mockVMSSVMClient := mockvmssvmclient.NewMockInterface(ctrl)
675+
ss.cloud.VirtualMachineScaleSetsClient = mockVMSSClient
676+
ss.cloud.VirtualMachineScaleSetVMsClient = mockVMSSVMClient
677+
678+
expectedScaleSet := buildTestVMSS(test.scaleSet, test.scaleSet)
679+
mockVMSSClient.EXPECT().List(gomock.Any(), gomock.Any()).Return([]compute.VirtualMachineScaleSet{expectedScaleSet}, test.rerr).AnyTimes()
680+
681+
expectedVMs, _, _ := buildTestVirtualMachineEnv(ss.cloud, test.scaleSet, "", 0, test.vmList, "succeeded", false)
682+
mockVMSSVMClient.EXPECT().List(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).Return(expectedVMs, test.rerr).AnyTimes()
683+
684+
mockVMsClient := ss.cloud.VirtualMachinesClient.(*mockvmclient.MockInterface)
685+
mockVMsClient.EXPECT().List(gomock.Any(), gomock.Any()).Return([]compute.VirtualMachine{}, nil).AnyTimes()
686+
687+
exist, _ := cloud.InstanceExistsByProviderID(context.Background(), test.providerID)
688+
assert.Equal(t, test.expected, exist, test.name)
689+
}
636690
}
637691

638692
func TestNodeAddressesByProviderID(t *testing.T) {

staging/src/k8s.io/legacy-cloud-providers/azure/azure_vmss.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -697,6 +697,9 @@ func (ss *scaleSet) listScaleSetVMs(scaleSetName, resourceGroup string) ([]compu
697697
allVMs, rerr := ss.VirtualMachineScaleSetVMsClient.List(ctx, resourceGroup, scaleSetName, string(compute.InstanceView))
698698
if rerr != nil {
699699
klog.Errorf("VirtualMachineScaleSetVMsClient.List failed: %v", rerr)
700+
if rerr.IsNotFound() {
701+
return nil, cloudprovider.InstanceNotFound
702+
}
700703
return nil, rerr.Error()
701704
}
702705

staging/src/k8s.io/legacy-cloud-providers/azure/retry/azure_error.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,15 @@ func (err *Error) IsThrottled() bool {
8989
return err.HTTPStatusCode == http.StatusTooManyRequests || err.RetryAfter.After(now())
9090
}
9191

92+
// IsNotFound returns true the if the requested object wasn't found
93+
func (err *Error) IsNotFound() bool {
94+
if err == nil {
95+
return false
96+
}
97+
98+
return err.HTTPStatusCode == http.StatusNotFound
99+
}
100+
92101
// NewError creates a new Error.
93102
func NewError(retriable bool, err error) *Error {
94103
return &Error{

0 commit comments

Comments
 (0)