Skip to content

Commit 9e8c80b

Browse files
committed
chore: allow active context to issue force detach on timeout
chore: add pkg tgz and fix linter chore: pass min detach time in seconds chore: update helm pkg chore: don't update charts in this PR chore: fix time unit after renaming
1 parent be77419 commit 9e8c80b

File tree

6 files changed

+328
-95
lines changed

6 files changed

+328
-95
lines changed

pkg/azuredisk/azure_controller_common.go

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,9 @@ const (
6161
// default initial delay in milliseconds for batch disk attach/detach
6262
defaultAttachDetachInitialDelayInMs = 1000
6363

64+
// default detach operation timeout
65+
defaultDetachOperationMinTimeoutInSeconds = 240
66+
6467
// WriteAcceleratorEnabled support for Azure Write Accelerator on Azure Disks
6568
// https://docs.microsoft.com/azure/virtual-machines/windows/how-to-enable-write-accelerator
6669
WriteAcceleratorEnabled = "writeacceleratorenabled"
@@ -90,6 +93,8 @@ type controllerCommon struct {
9093
detachDiskMap sync.Map
9194
// DisableDiskLunCheck whether disable disk lun check after disk attach/detach
9295
DisableDiskLunCheck bool
96+
// DetachOperationMinTimeoutInSeconds determines timeout for waiting on detach operation before a force detach
97+
DetachOperationMinTimeoutInSeconds int
9398
// AttachDetachInitialDelayInMs determines initial delay in milliseconds for batch disk attach/detach
9499
AttachDetachInitialDelayInMs int
95100
ForceDetachBackoff bool
@@ -394,14 +399,29 @@ func (c *controllerCommon) DetachDisk(ctx context.Context, diskName, diskURI str
394399

395400
if len(diskMap) == 0 {
396401
// disk was already processed in the batch, return the result
402+
// returns success when the disk has toBeDetached flag set to true
403+
// we could consider issuing a force detach if the disk is stuck in ToBeDetached state for too long
397404
return c.verifyDetach(ctx, diskName, diskURI, nodeName)
398405
}
399406

400407
klog.V(2).Infof("Trying to detach volume %s from node %s, diskMap len:%d, %s", diskURI, nodeName, len(diskMap), diskMap)
401408
c.diskStateMap.Store(formattedDiskURI, "detaching")
402409
defer c.diskStateMap.Delete(formattedDiskURI)
403410

404-
if err = vmset.DetachDisk(ctx, nodeName, diskMap, false); err != nil {
411+
// Calculate timeout for regular detach operation, if operation fails or times out,
412+
// we need to have an active context to issue a force detach
413+
detachContextTimeout := time.Duration(c.DetachOperationMinTimeoutInSeconds) * time.Second
414+
if deadline, ok := ctx.Deadline(); ok {
415+
remaining := time.Until(deadline)
416+
halfRemaining := remaining / 2
417+
if halfRemaining > detachContextTimeout {
418+
detachContextTimeout = halfRemaining
419+
}
420+
}
421+
detachCtx, cancel := context.WithTimeout(ctx, detachContextTimeout)
422+
defer cancel()
423+
424+
if err = vmset.DetachDisk(detachCtx, nodeName, diskMap, false); err != nil {
405425
if isInstanceNotFoundError(err) {
406426
// if host doesn't exist, no need to detach
407427
klog.Warningf("azureDisk - got InstanceNotFoundError(%v), DetachDisk(%s) will assume disk is already detached",

pkg/azuredisk/azure_controller_common_test.go

Lines changed: 199 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -208,6 +208,205 @@ func TestCommonAttachDisk(t *testing.T) {
208208
}
209209
}
210210

211+
func TestForceDetach(t *testing.T) {
212+
ctrl := gomock.NewController(t)
213+
defer ctrl.Finish()
214+
215+
testCases := []struct {
216+
desc string
217+
vmList map[string]string
218+
nodeName types.NodeName
219+
diskName string
220+
forceDetachBackoff bool
221+
detachOperationTimeout int
222+
contextTimeout time.Duration
223+
firstDetachError error
224+
forceDetachError error
225+
expectedErr bool
226+
expectForceDetach bool
227+
}{
228+
{
229+
desc: "force detach should be called when regular detach times out",
230+
vmList: map[string]string{"vm1": "PowerState/Running"},
231+
nodeName: "vm1",
232+
diskName: "disk1",
233+
forceDetachBackoff: true,
234+
detachOperationTimeout: 1, // 1s timeout
235+
contextTimeout: 2 * time.Second, // not more than double the detach timeout
236+
firstDetachError: context.DeadlineExceeded,
237+
forceDetachError: nil,
238+
expectedErr: false,
239+
expectForceDetach: true,
240+
},
241+
{
242+
desc: "detach operation timeout of more than half the context timeout should be respected",
243+
vmList: map[string]string{"vm1": "PowerState/Running"},
244+
nodeName: "vm1",
245+
diskName: "disk1",
246+
forceDetachBackoff: true,
247+
detachOperationTimeout: 2, // 2s timeout
248+
contextTimeout: 3 * time.Second, // not more than double the detach timeout
249+
firstDetachError: context.DeadlineExceeded,
250+
forceDetachError: nil,
251+
expectedErr: false,
252+
expectForceDetach: true,
253+
},
254+
{
255+
desc: "force detach should be called with half context timeout when min detach timeout is less than half context timeout",
256+
vmList: map[string]string{"vm1": "PowerState/Running"},
257+
nodeName: "vm1",
258+
diskName: "disk1",
259+
forceDetachBackoff: true,
260+
detachOperationTimeout: 1, // 1s timeout
261+
contextTimeout: 3 * time.Second, // more than double the detach timeout
262+
firstDetachError: context.DeadlineExceeded,
263+
forceDetachError: nil,
264+
expectedErr: false,
265+
expectForceDetach: true,
266+
},
267+
{
268+
desc: "force detach should be called when regular detach fails",
269+
vmList: map[string]string{"vm1": "PowerState/Running"},
270+
nodeName: "vm1",
271+
diskName: "disk1",
272+
forceDetachBackoff: true,
273+
detachOperationTimeout: 1,
274+
contextTimeout: 2 * time.Second,
275+
firstDetachError: errors.New("detach failed"),
276+
forceDetachError: nil,
277+
expectedErr: false,
278+
expectForceDetach: true,
279+
},
280+
{
281+
desc: "should return error when force detach also fails",
282+
vmList: map[string]string{"vm1": "PowerState/Running"},
283+
nodeName: "vm1",
284+
diskName: "disk1",
285+
forceDetachBackoff: true,
286+
detachOperationTimeout: 1,
287+
contextTimeout: 2 * time.Second,
288+
firstDetachError: context.DeadlineExceeded,
289+
forceDetachError: errors.New("force detach failed"),
290+
expectedErr: true,
291+
expectForceDetach: true,
292+
},
293+
{
294+
desc: "force detach should not be called when forceDetachBackoff is false",
295+
vmList: map[string]string{"vm1": "PowerState/Running"},
296+
nodeName: "vm1",
297+
diskName: "disk1",
298+
forceDetachBackoff: false,
299+
detachOperationTimeout: 1,
300+
contextTimeout: 2 * time.Second,
301+
firstDetachError: errors.New("detach failed"),
302+
forceDetachError: nil,
303+
expectedErr: true,
304+
expectForceDetach: false,
305+
},
306+
{
307+
desc: "successful regular detach should not trigger force detach",
308+
vmList: map[string]string{"vm1": "PowerState/Running"},
309+
nodeName: "vm1",
310+
diskName: "disk1",
311+
forceDetachBackoff: true,
312+
detachOperationTimeout: 1,
313+
contextTimeout: 2 * time.Second,
314+
firstDetachError: nil,
315+
forceDetachError: nil,
316+
expectedErr: false,
317+
expectForceDetach: false,
318+
},
319+
}
320+
321+
for i, test := range testCases {
322+
t.Run(test.desc, func(t *testing.T) {
323+
ctx, cancel := context.WithTimeout(context.Background(), test.contextTimeout)
324+
defer cancel()
325+
testCloud := provider.GetTestCloud(ctrl)
326+
common := &controllerCommon{
327+
cloud: testCloud,
328+
lockMap: newLockMap(),
329+
ForceDetachBackoff: test.forceDetachBackoff,
330+
DetachOperationMinTimeoutInSeconds: test.detachOperationTimeout,
331+
DisableDiskLunCheck: true, // Disable lun check to simplify test
332+
}
333+
334+
diskURI := fmt.Sprintf("/subscriptions/%s/resourceGroups/%s/providers/Microsoft.Compute/disks/%s",
335+
testCloud.SubscriptionID, testCloud.ResourceGroup, test.diskName)
336+
337+
expectedVMs := setTestVirtualMachines(testCloud, test.vmList, false)
338+
mockVMClient := testCloud.ComputeClientFactory.GetVirtualMachineClient().(*mockvmclient.MockInterface)
339+
340+
for _, vm := range expectedVMs {
341+
mockVMClient.EXPECT().Get(gomock.Any(), testCloud.ResourceGroup, *vm.Name, gomock.Any()).Return(&vm, nil).AnyTimes()
342+
}
343+
344+
// Set up expectations for CreateOrUpdate calls
345+
callCount := 0
346+
if test.expectForceDetach {
347+
// Expect two calls: regular detach and force detach
348+
mockVMClient.EXPECT().CreateOrUpdate(gomock.Any(), testCloud.ResourceGroup, gomock.Any(), gomock.Any()).DoAndReturn(
349+
func(ctx context.Context, _ string, _ string, vm armcompute.VirtualMachine) (*armcompute.VirtualMachine, error) {
350+
callCount++
351+
if callCount == 1 {
352+
// First call is regular detach
353+
// Verify that context timeout is at least the min detach timeout
354+
contextDeadline, ok := ctx.Deadline()
355+
assert.True(t, ok, "Context should have a deadline")
356+
assert.True(t, contextDeadline.After(time.Now()), "Context deadline should be in the future")
357+
assert.True(t, time.Until(contextDeadline) >= time.Duration(test.detachOperationTimeout)*time.Millisecond-100*time.Millisecond, "Context deadline should exceed min detach timeout.")
358+
assert.True(t, time.Until(contextDeadline) >= test.contextTimeout/2-100*time.Millisecond, "Context deadline should be at least half of context timeout.")
359+
// Simulate timeout by sleeping longer than context deadline
360+
if test.firstDetachError == context.DeadlineExceeded {
361+
time.Sleep(time.Until(contextDeadline.Add(50 * time.Millisecond)))
362+
}
363+
return nil, test.firstDetachError
364+
} else if callCount == 2 {
365+
// Second call is force detach
366+
// Verify force detach parameter is set
367+
if vm.Properties != nil && vm.Properties.StorageProfile != nil {
368+
for _, disk := range vm.Properties.StorageProfile.DataDisks {
369+
if disk.Name != nil && *disk.Name == test.diskName {
370+
assert.NotNil(t, disk.DetachOption, "DetachOption should be set for force detach")
371+
if disk.DetachOption != nil {
372+
assert.Equal(t, armcompute.DiskDetachOptionTypesForceDetach, *disk.DetachOption, "DetachOption should be ForceDetach")
373+
}
374+
}
375+
}
376+
}
377+
return nil, test.forceDetachError
378+
}
379+
return nil, errors.New("unexpected call")
380+
}).Times(2)
381+
} else {
382+
// Expect only one call for regular detach
383+
mockVMClient.EXPECT().CreateOrUpdate(gomock.Any(), testCloud.ResourceGroup, gomock.Any(), gomock.Any()).Return(nil, test.firstDetachError).Times(1)
384+
}
385+
386+
// Create context with custom timeout if specified
387+
testCtx := ctx
388+
if test.contextTimeout > 0 {
389+
testCtx, cancel = context.WithTimeout(ctx, test.contextTimeout)
390+
defer cancel()
391+
}
392+
393+
err := common.DetachDisk(testCtx, test.diskName, diskURI, test.nodeName)
394+
395+
if test.expectedErr {
396+
assert.Error(t, err, "TestCase[%d]: %s", i, test.desc)
397+
} else {
398+
assert.NoError(t, err, "TestCase[%d]: %s", i, test.desc)
399+
}
400+
401+
if test.expectForceDetach {
402+
assert.Equal(t, 2, callCount, "TestCase[%d]: %s - Expected force detach to be called", i, test.desc)
403+
} else {
404+
assert.LessOrEqual(t, callCount, 1, "TestCase[%d]: %s - Force detach should not be called", i, test.desc)
405+
}
406+
})
407+
}
408+
}
409+
211410
func TestCommonDetachDisk(t *testing.T) {
212411
ctrl := gomock.NewController(t)
213412
defer ctrl.Finish()

pkg/azuredisk/azure_managedDiskController.go

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -49,10 +49,11 @@ type ManagedDiskController struct {
4949

5050
func NewManagedDiskController(provider *provider.Cloud) *ManagedDiskController {
5151
common := &controllerCommon{
52-
cloud: provider,
53-
lockMap: newLockMap(),
54-
AttachDetachInitialDelayInMs: defaultAttachDetachInitialDelayInMs,
55-
clientFactory: provider.ComputeClientFactory,
52+
cloud: provider,
53+
lockMap: newLockMap(),
54+
AttachDetachInitialDelayInMs: defaultAttachDetachInitialDelayInMs,
55+
DetachOperationMinTimeoutInSeconds: defaultDetachOperationMinTimeoutInSeconds,
56+
clientFactory: provider.ComputeClientFactory,
5657
}
5758
getter := func(_ context.Context, _ string) (interface{}, error) { return nil, nil }
5859
common.hitMaxDataDiskCountCache, _ = azcache.NewTimedCache(5*time.Minute, getter, false)

pkg/azuredisk/azure_managedDiskController_test.go

Lines changed: 15 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -265,10 +265,11 @@ func TestCreateManagedDisk(t *testing.T) {
265265
testCloud := provider.GetTestCloud(ctrl)
266266

267267
common := &controllerCommon{
268-
cloud: testCloud,
269-
lockMap: newLockMap(),
270-
AttachDetachInitialDelayInMs: defaultAttachDetachInitialDelayInMs,
271-
clientFactory: testCloud.ComputeClientFactory,
268+
cloud: testCloud,
269+
lockMap: newLockMap(),
270+
AttachDetachInitialDelayInMs: defaultAttachDetachInitialDelayInMs,
271+
DetachOperationMinTimeoutInSeconds: defaultDetachOperationMinTimeoutInSeconds,
272+
clientFactory: testCloud.ComputeClientFactory,
272273
}
273274

274275
managedDiskController := &ManagedDiskController{common}
@@ -330,10 +331,11 @@ func TestCreateManagedDiskWithExtendedLocation(t *testing.T) {
330331
}
331332

332333
common := &controllerCommon{
333-
cloud: testCloud,
334-
lockMap: newLockMap(),
335-
AttachDetachInitialDelayInMs: defaultAttachDetachInitialDelayInMs,
336-
clientFactory: testCloud.ComputeClientFactory,
334+
cloud: testCloud,
335+
lockMap: newLockMap(),
336+
AttachDetachInitialDelayInMs: defaultAttachDetachInitialDelayInMs,
337+
DetachOperationMinTimeoutInSeconds: defaultDetachOperationMinTimeoutInSeconds,
338+
clientFactory: testCloud.ComputeClientFactory,
337339
}
338340

339341
managedDiskController := &ManagedDiskController{common}
@@ -397,10 +399,11 @@ func TestDeleteManagedDisk(t *testing.T) {
397399
testCloud := provider.GetTestCloud(ctrl)
398400

399401
common := &controllerCommon{
400-
cloud: testCloud,
401-
lockMap: newLockMap(),
402-
AttachDetachInitialDelayInMs: defaultAttachDetachInitialDelayInMs,
403-
clientFactory: testCloud.ComputeClientFactory,
402+
cloud: testCloud,
403+
lockMap: newLockMap(),
404+
AttachDetachInitialDelayInMs: defaultAttachDetachInitialDelayInMs,
405+
DetachOperationMinTimeoutInSeconds: defaultDetachOperationMinTimeoutInSeconds,
406+
clientFactory: testCloud.ComputeClientFactory,
404407
}
405408

406409
managedDiskController := &ManagedDiskController{common}

0 commit comments

Comments
 (0)