Skip to content

Commit 64a0441

Browse files
authored
Merge pull request kubernetes#76573 from andyzhangx/disk-backoff-refactor
fix detach azure disk back off issue which has too big lock in failure retry condition
2 parents 5f1a3a9 + 6c70ca6 commit 64a0441

File tree

11 files changed

+158
-93
lines changed

11 files changed

+158
-93
lines changed

pkg/cloudprovider/providers/.import-restrictions

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,8 @@
88
"k8s.io/utils/io",
99
"k8s.io/utils/strings",
1010
"k8s.io/utils/exec",
11-
"k8s.io/utils/path"
11+
"k8s.io/utils/path",
12+
"k8s.io/utils/keymutex"
1213
]
1314
},
1415
{

pkg/cloudprovider/providers/azure/BUILD

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,7 @@ go_library(
6969
"//vendor/github.com/prometheus/client_golang/prometheus:go_default_library",
7070
"//vendor/github.com/rubiojr/go-vhd/vhd:go_default_library",
7171
"//vendor/k8s.io/klog:go_default_library",
72+
"//vendor/k8s.io/utils/keymutex:go_default_library",
7273
"//vendor/sigs.k8s.io/yaml:go_default_library",
7374
],
7475
)
@@ -78,6 +79,7 @@ go_test(
7879
srcs = [
7980
"azure_backoff_test.go",
8081
"azure_cache_test.go",
82+
"azure_controller_common_test.go",
8183
"azure_instances_test.go",
8284
"azure_loadbalancer_test.go",
8385
"azure_metrics_test.go",

pkg/cloudprovider/providers/azure/azure_controller_common.go

Lines changed: 58 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ limitations under the License.
1717
package azure
1818

1919
import (
20+
"context"
2021
"fmt"
2122
"time"
2223

@@ -26,6 +27,7 @@ import (
2627
"k8s.io/apimachinery/pkg/types"
2728
kwait "k8s.io/apimachinery/pkg/util/wait"
2829
cloudprovider "k8s.io/cloud-provider"
30+
"k8s.io/utils/keymutex"
2931
)
3032

3133
const (
@@ -50,6 +52,9 @@ var defaultBackOff = kwait.Backoff{
5052
Jitter: 0.0,
5153
}
5254

55+
// acquire lock to attach/detach disk in one node
56+
var diskOpMutex = keymutex.NewHashed(0)
57+
5358
type controllerCommon struct {
5459
subscriptionID string
5560
location string
@@ -85,24 +90,72 @@ func (c *controllerCommon) getNodeVMSet(nodeName types.NodeName) (VMSet, error)
8590
return ss, nil
8691
}
8792

88-
// AttachDisk attaches a vhd to vm. The vhd must exist, can be identified by diskName, diskURI, and lun.
89-
func (c *controllerCommon) AttachDisk(isManagedDisk bool, diskName, diskURI string, nodeName types.NodeName, lun int32, cachingMode compute.CachingTypes) error {
93+
// AttachDisk attaches a vhd to vm. The vhd must exist, can be identified by diskName, diskURI.
94+
func (c *controllerCommon) AttachDisk(isManagedDisk bool, diskName, diskURI string, nodeName types.NodeName, cachingMode compute.CachingTypes) error {
9095
vmset, err := c.getNodeVMSet(nodeName)
9196
if err != nil {
9297
return err
9398
}
9499

100+
instanceid, err := c.cloud.InstanceID(context.TODO(), nodeName)
101+
if err != nil {
102+
klog.Warningf("failed to get azure instance id (%v)", err)
103+
return fmt.Errorf("failed to get azure instance id for node %q (%v)", nodeName, err)
104+
}
105+
106+
diskOpMutex.LockKey(instanceid)
107+
defer diskOpMutex.UnlockKey(instanceid)
108+
109+
lun, err := c.GetNextDiskLun(nodeName)
110+
if err != nil {
111+
klog.Warningf("no LUN available for instance %q (%v)", nodeName, err)
112+
return fmt.Errorf("all LUNs are used, cannot attach volume (%s, %s) to instance %q (%v)", diskName, diskURI, instanceid, err)
113+
}
114+
115+
klog.V(2).Infof("Trying to attach volume %q lun %d to node %q.", diskURI, lun, nodeName)
95116
return vmset.AttachDisk(isManagedDisk, diskName, diskURI, nodeName, lun, cachingMode)
96117
}
97118

98-
// DetachDiskByName detaches a vhd from host. The vhd can be identified by diskName or diskURI.
99-
func (c *controllerCommon) DetachDiskByName(diskName, diskURI string, nodeName types.NodeName) error {
119+
// DetachDisk detaches a disk from host. The vhd can be identified by diskName or diskURI.
120+
func (c *controllerCommon) DetachDisk(diskName, diskURI string, nodeName types.NodeName) error {
100121
vmset, err := c.getNodeVMSet(nodeName)
101122
if err != nil {
102123
return err
103124
}
104125

105-
return vmset.DetachDiskByName(diskName, diskURI, nodeName)
126+
instanceid, err := c.cloud.InstanceID(context.TODO(), nodeName)
127+
if err != nil {
128+
klog.Warningf("failed to get azure instance id (%v)", err)
129+
return fmt.Errorf("failed to get azure instance id for node %q (%v)", nodeName, err)
130+
}
131+
132+
klog.V(2).Infof("detach %v from node %q", diskURI, nodeName)
133+
134+
// make the lock here as small as possible
135+
diskOpMutex.LockKey(instanceid)
136+
resp, err := vmset.DetachDisk(diskName, diskURI, nodeName)
137+
diskOpMutex.UnlockKey(instanceid)
138+
139+
if c.cloud.CloudProviderBackoff && shouldRetryHTTPRequest(resp, err) {
140+
klog.V(2).Infof("azureDisk - update backing off: detach disk(%s, %s), err: %v", diskName, diskURI, err)
141+
retryErr := kwait.ExponentialBackoff(c.cloud.requestBackoff(), func() (bool, error) {
142+
diskOpMutex.LockKey(instanceid)
143+
resp, err := vmset.DetachDisk(diskName, diskURI, nodeName)
144+
diskOpMutex.UnlockKey(instanceid)
145+
return c.cloud.processHTTPRetryResponse(nil, "", resp, err)
146+
})
147+
if retryErr != nil {
148+
err = retryErr
149+
klog.V(2).Infof("azureDisk - update abort backoff: detach disk(%s, %s), err: %v", diskName, diskURI, err)
150+
}
151+
}
152+
if err != nil {
153+
klog.Errorf("azureDisk - detach disk(%s, %s) failed, err: %v", diskName, diskURI, err)
154+
} else {
155+
klog.V(2).Infof("azureDisk - detach disk(%s, %s) succeeded", diskName, diskURI)
156+
}
157+
158+
return err
106159
}
107160

108161
// getNodeDataDisks invokes vmSet interfaces to get data disks for the node.
Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
/*
2+
Copyright 2019 The Kubernetes Authors.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package azure
18+
19+
import (
20+
"fmt"
21+
"testing"
22+
23+
"github.com/Azure/azure-sdk-for-go/services/compute/mgmt/2018-10-01/compute"
24+
)
25+
26+
func TestAttachDisk(t *testing.T) {
27+
c := getTestCloud()
28+
29+
common := &controllerCommon{
30+
location: c.Location,
31+
storageEndpointSuffix: c.Environment.StorageEndpointSuffix,
32+
resourceGroup: c.ResourceGroup,
33+
subscriptionID: c.SubscriptionID,
34+
cloud: c,
35+
}
36+
37+
diskURI := fmt.Sprintf("/subscriptions/%s/resourceGroups/%s/providers/Microsoft.Compute/disks/disk-name", c.SubscriptionID, c.ResourceGroup)
38+
39+
err := common.AttachDisk(true, "", diskURI, "node1", compute.CachingTypesReadOnly)
40+
if err != nil {
41+
fmt.Printf("TestAttachDisk return expected error: %v", err)
42+
} else {
43+
t.Errorf("TestAttachDisk unexpected nil err")
44+
}
45+
}
46+
47+
func TestDetachDisk(t *testing.T) {
48+
c := getTestCloud()
49+
50+
common := &controllerCommon{
51+
location: c.Location,
52+
storageEndpointSuffix: c.Environment.StorageEndpointSuffix,
53+
resourceGroup: c.ResourceGroup,
54+
subscriptionID: c.SubscriptionID,
55+
cloud: c,
56+
}
57+
58+
diskURI := fmt.Sprintf("/subscriptions/%s/resourceGroups/%s/providers/Microsoft.Compute/disks/disk-name", c.SubscriptionID, c.ResourceGroup)
59+
60+
err := common.DetachDisk("", diskURI, "node1")
61+
if err != nil {
62+
fmt.Printf("TestAttachDisk return expected error: %v", err)
63+
} else {
64+
t.Errorf("TestAttachDisk unexpected nil err")
65+
}
66+
}

pkg/cloudprovider/providers/azure/azure_controller_standard.go

Lines changed: 8 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ package azure
1818

1919
import (
2020
"fmt"
21+
"net/http"
2122
"strings"
2223

2324
"github.com/Azure/azure-sdk-for-go/services/compute/mgmt/2018-10-01/compute"
@@ -88,28 +89,28 @@ func (as *availabilitySet) AttachDisk(isManagedDisk bool, diskName, diskURI stri
8889
if strings.Contains(detail, errLeaseFailed) || strings.Contains(detail, errDiskBlobNotFound) {
8990
// if lease cannot be acquired or disk not found, immediately detach the disk and return the original error
9091
klog.V(2).Infof("azureDisk - err %v, try detach disk(%s, %s)", err, diskName, diskURI)
91-
as.DetachDiskByName(diskName, diskURI, nodeName)
92+
as.DetachDisk(diskName, diskURI, nodeName)
9293
}
9394
} else {
9495
klog.V(2).Infof("azureDisk - attach disk(%s, %s) succeeded", diskName, diskURI)
9596
}
9697
return err
9798
}
9899

99-
// DetachDiskByName detaches a vhd from host
100+
// DetachDisk detaches a disk from host
100101
// the vhd can be identified by diskName or diskURI
101-
func (as *availabilitySet) DetachDiskByName(diskName, diskURI string, nodeName types.NodeName) error {
102+
func (as *availabilitySet) DetachDisk(diskName, diskURI string, nodeName types.NodeName) (*http.Response, error) {
102103
vm, err := as.getVirtualMachine(nodeName)
103104
if err != nil {
104105
// if host doesn't exist, no need to detach
105106
klog.Warningf("azureDisk - cannot find node %s, skip detaching disk(%s, %s)", nodeName, diskName, diskURI)
106-
return nil
107+
return nil, nil
107108
}
108109

109110
vmName := mapNodeNameToVMName(nodeName)
110111
nodeResourceGroup, err := as.GetNodeResourceGroup(vmName)
111112
if err != nil {
112-
return err
113+
return nil, err
113114
}
114115

115116
disks := *vm.StorageProfile.DataDisks
@@ -127,7 +128,7 @@ func (as *availabilitySet) DetachDiskByName(diskName, diskURI string, nodeName t
127128
}
128129

129130
if !bFoundDisk {
130-
return fmt.Errorf("detach azure disk failure, disk %s not found, diskURI: %s", diskName, diskURI)
131+
return nil, fmt.Errorf("detach azure disk failure, disk %s not found, diskURI: %s", diskName, diskURI)
131132
}
132133

133134
newVM := compute.VirtualMachine{
@@ -146,22 +147,7 @@ func (as *availabilitySet) DetachDiskByName(diskName, diskURI string, nodeName t
146147
// Invalidate the cache right after updating
147148
defer as.cloud.vmCache.Delete(vmName)
148149

149-
resp, err := as.VirtualMachinesClient.CreateOrUpdate(ctx, nodeResourceGroup, vmName, newVM)
150-
if as.CloudProviderBackoff && shouldRetryHTTPRequest(resp, err) {
151-
klog.V(2).Infof("azureDisk - update(%s) backing off: vm(%s) detach disk(%s, %s), err: %v", nodeResourceGroup, vmName, diskName, diskURI, err)
152-
retryErr := as.CreateOrUpdateVMWithRetry(nodeResourceGroup, vmName, newVM)
153-
if retryErr != nil {
154-
err = retryErr
155-
klog.V(2).Infof("azureDisk - update(%s) abort backoff: vm(%s) detach disk(%s, %s), err: %v", nodeResourceGroup, vmName, diskName, diskURI, err)
156-
}
157-
}
158-
if err != nil {
159-
klog.Errorf("azureDisk - detach disk(%s, %s) failed, err: %v", diskName, diskURI, err)
160-
} else {
161-
klog.V(2).Infof("azureDisk - detach disk(%s, %s) succeeded", diskName, diskURI)
162-
}
163-
164-
return err
150+
return as.VirtualMachinesClient.CreateOrUpdate(ctx, nodeResourceGroup, vmName, newVM)
165151
}
166152

167153
// GetDataDisks gets a list of data disks attached to the node.

pkg/cloudprovider/providers/azure/azure_controller_vmss.go

Lines changed: 8 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ package azure
1818

1919
import (
2020
"fmt"
21+
"net/http"
2122
"strings"
2223

2324
"github.com/Azure/azure-sdk-for-go/services/compute/mgmt/2018-10-01/compute"
@@ -93,26 +94,26 @@ func (ss *scaleSet) AttachDisk(isManagedDisk bool, diskName, diskURI string, nod
9394
if strings.Contains(detail, errLeaseFailed) || strings.Contains(detail, errDiskBlobNotFound) {
9495
// if lease cannot be acquired or disk not found, immediately detach the disk and return the original error
9596
klog.Infof("azureDisk - err %s, try detach disk(%s, %s)", detail, diskName, diskURI)
96-
ss.DetachDiskByName(diskName, diskURI, nodeName)
97+
ss.DetachDisk(diskName, diskURI, nodeName)
9798
}
9899
} else {
99100
klog.V(2).Infof("azureDisk - attach disk(%s, %s) succeeded", diskName, diskURI)
100101
}
101102
return err
102103
}
103104

104-
// DetachDiskByName detaches a vhd from host
105+
// DetachDisk detaches a disk from host
105106
// the vhd can be identified by diskName or diskURI
106-
func (ss *scaleSet) DetachDiskByName(diskName, diskURI string, nodeName types.NodeName) error {
107+
func (ss *scaleSet) DetachDisk(diskName, diskURI string, nodeName types.NodeName) (*http.Response, error) {
107108
vmName := mapNodeNameToVMName(nodeName)
108109
ssName, instanceID, vm, err := ss.getVmssVM(vmName)
109110
if err != nil {
110-
return err
111+
return nil, err
111112
}
112113

113114
nodeResourceGroup, err := ss.GetNodeResourceGroup(vmName)
114115
if err != nil {
115-
return err
116+
return nil, err
116117
}
117118

118119
disks := []compute.DataDisk{}
@@ -133,7 +134,7 @@ func (ss *scaleSet) DetachDiskByName(diskName, diskURI string, nodeName types.No
133134
}
134135

135136
if !bFoundDisk {
136-
return fmt.Errorf("detach azure disk failure, disk %s not found, diskURI: %s", diskName, diskURI)
137+
return nil, fmt.Errorf("detach azure disk failure, disk %s not found, diskURI: %s", diskName, diskURI)
137138
}
138139

139140
newVM := compute.VirtualMachineScaleSetVM{
@@ -156,22 +157,7 @@ func (ss *scaleSet) DetachDiskByName(diskName, diskURI string, nodeName types.No
156157
defer ss.vmssVMCache.Delete(key)
157158

158159
klog.V(2).Infof("azureDisk - update(%s): vm(%s) - detach disk(%s, %s)", nodeResourceGroup, nodeName, diskName, diskURI)
159-
resp, err := ss.VirtualMachineScaleSetVMsClient.Update(ctx, nodeResourceGroup, ssName, instanceID, newVM)
160-
if ss.CloudProviderBackoff && shouldRetryHTTPRequest(resp, err) {
161-
klog.V(2).Infof("azureDisk - update(%s) backing off: vm(%s) detach disk(%s, %s), err: %v", nodeResourceGroup, nodeName, diskName, diskURI, err)
162-
retryErr := ss.UpdateVmssVMWithRetry(nodeResourceGroup, ssName, instanceID, newVM)
163-
if retryErr != nil {
164-
err = retryErr
165-
klog.V(2).Infof("azureDisk - update(%s) abort backoff: vm(%s) detach disk(%s, %s), err: %v", nodeResourceGroup, nodeName, diskName, diskURI, err)
166-
}
167-
}
168-
if err != nil {
169-
klog.Errorf("azureDisk - detach disk(%s, %s) from %s failed, err: %v", diskName, diskURI, nodeName, err)
170-
} else {
171-
klog.V(2).Infof("azureDisk - detach disk(%s, %s) succeeded", diskName, diskURI)
172-
}
173-
174-
return err
160+
return ss.VirtualMachineScaleSetVMsClient.Update(ctx, nodeResourceGroup, ssName, instanceID, newVM)
175161
}
176162

177163
// GetDataDisks gets a list of data disks attached to the node.

pkg/cloudprovider/providers/azure/azure_fakes.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -910,8 +910,8 @@ func (f *fakeVMSet) AttachDisk(isManagedDisk bool, diskName, diskURI string, nod
910910
return fmt.Errorf("unimplemented")
911911
}
912912

913-
func (f *fakeVMSet) DetachDiskByName(diskName, diskURI string, nodeName types.NodeName) error {
914-
return fmt.Errorf("unimplemented")
913+
func (f *fakeVMSet) DetachDisk(diskName, diskURI string, nodeName types.NodeName) (*http.Response, error) {
914+
return nil, fmt.Errorf("unimplemented")
915915
}
916916

917917
func (f *fakeVMSet) GetDataDisks(nodeName types.NodeName) ([]compute.DataDisk, error) {

pkg/cloudprovider/providers/azure/azure_vmsets.go

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@ limitations under the License.
1717
package azure
1818

1919
import (
20+
"net/http"
21+
2022
"github.com/Azure/azure-sdk-for-go/services/compute/mgmt/2018-10-01/compute"
2123
"github.com/Azure/azure-sdk-for-go/services/network/mgmt/2017-09-01/network"
2224

@@ -60,8 +62,8 @@ type VMSet interface {
6062

6163
// AttachDisk attaches a vhd to vm. The vhd must exist, can be identified by diskName, diskURI, and lun.
6264
AttachDisk(isManagedDisk bool, diskName, diskURI string, nodeName types.NodeName, lun int32, cachingMode compute.CachingTypes) error
63-
// DetachDiskByName detaches a vhd from host. The vhd can be identified by diskName or diskURI.
64-
DetachDiskByName(diskName, diskURI string, nodeName types.NodeName) error
65+
// DetachDisk detaches a vhd from host. The vhd can be identified by diskName or diskURI.
66+
DetachDisk(diskName, diskURI string, nodeName types.NodeName) (*http.Response, error)
6567
// GetDataDisks gets a list of data disks attached to the node.
6668
GetDataDisks(nodeName types.NodeName) ([]compute.DataDisk, error)
6769

pkg/volume/azure_dd/BUILD

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,6 @@ go_library(
4040
"//vendor/github.com/Azure/azure-sdk-for-go/services/compute/mgmt/2018-10-01/compute:go_default_library",
4141
"//vendor/github.com/Azure/azure-sdk-for-go/services/storage/mgmt/2018-07-01/storage:go_default_library",
4242
"//vendor/k8s.io/klog:go_default_library",
43-
"//vendor/k8s.io/utils/keymutex:go_default_library",
4443
"//vendor/k8s.io/utils/strings:go_default_library",
4544
],
4645
)

0 commit comments

Comments
 (0)