chore: when node disk attach limit is reached, the driver should return ResourceExhausted errors. #3237
base: master
Changes from 1 commit
@@ -232,13 +232,13 @@ func (c *controllerCommon) AttachDisk(ctx context.Context, diskName, diskURI str
 				if int(maxNumDisks) > numDisksAttached {
 					numDisksAllowed = int(maxNumDisks) - numDisksAttached
 				} else {
-					numDisksAllowed = 0
+					return -1, fmt.Errorf("Maximum number of disks %s %d", util.MaximumDataDiskExceededMsg, maxNumDisks)
 				}
 			}
 		}
 	}

-	diskMap, err := c.retrieveAttachBatchedDiskRequests(node, diskuri)
+	diskMap, err := c.retrieveAttachBatchedDiskRequests(node, diskuri, numDisksAllowed)
 	if err != nil {
 		return -1, err
 	}
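The hunk above makes the controller return an error once the node's data-disk limit is reached; the PR title says the driver should surface this as a ResourceExhausted error, but the gRPC mapping itself is not part of this diff. A minimal sketch of how such a mapping could look, assuming a hypothetical classification helper and an assumed value for util.MaximumDataDiskExceededMsg:

```go
package main

import (
	"fmt"
	"strings"

	"google.golang.org/grpc/codes"
	"google.golang.org/grpc/status"
)

// maximumDataDiskExceededMsg stands in for util.MaximumDataDiskExceededMsg from
// the diff; the exact string is an assumption made for this illustration.
const maximumDataDiskExceededMsg = "exceeded for node"

// toCSIError sketches how an AttachDisk error could be classified before being
// returned from a CSI RPC: attach-limit errors become codes.ResourceExhausted so
// the caller (for example the Kubernetes attach/detach controller) can tell
// "this node is full" apart from a generic internal failure.
func toCSIError(attachErr error) error {
	if attachErr == nil {
		return nil
	}
	if strings.Contains(attachErr.Error(), maximumDataDiskExceededMsg) {
		return status.Errorf(codes.ResourceExhausted, "attach failed: %v", attachErr)
	}
	return status.Errorf(codes.Internal, "attach failed: %v", attachErr)
}

func main() {
	// Mirrors the shape of the error constructed in the hunk above.
	err := fmt.Errorf("Maximum number of disks %s %d", maximumDataDiskExceededMsg, 8)
	fmt.Println(toCSIError(err)) // rpc error: code = ResourceExhausted desc = attach failed: ...
}
```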
|
|
@@ -248,22 +248,6 @@ func (c *controllerCommon) AttachDisk(ctx context.Context, diskName, diskURI str
 		return c.verifyAttach(ctx, diskName, diskURI, nodeName)
 	}

-	// Remove some disks from the batch if the number is more than the max number of disks allowed
-	removeDisks := len(diskMap) - numDisksAllowed
-	if removeDisks > 0 {
-		klog.V(2).Infof("too many disks to attach, remove %d disks from the request", removeDisks)
-		for diskURI, options := range diskMap {
-			if removeDisks == 0 {
-				break
-			}
-			if options != nil {
-				klog.V(2).Infof("remove disk(%s) from attach request from node(%s)", diskURI, nodeName)
-				delete(diskMap, diskURI)
-			}
-			removeDisks--
-		}
-	}

 	lun, setLunErr := c.SetDiskLun(ctx, nodeName, diskuri, diskMap, occupiedLuns)
 	if setLunErr != nil {
 		return -1, setLunErr
|
|
@@ -333,7 +317,7 @@ func (c *controllerCommon) batchAttachDiskRequest(diskURI, nodeName string, opti

-// clean up attach disk requests
-func (c *controllerCommon) retrieveAttachBatchedDiskRequests(nodeName, diskURI string) (map[string]*provider.AttachDiskOptions, error) {
+// return original attach disk requests
+func (c *controllerCommon) retrieveAttachBatchedDiskRequests(nodeName, diskURI string, numDisksAllowed int) (map[string]*provider.AttachDiskOptions, error) {
 	var diskMap map[string]*provider.AttachDiskOptions

 	attachDiskMapKey := nodeName + attachDiskMapKeySuffix
|
|
@@ -350,7 +334,26 @@ func (c *controllerCommon) retrieveAttachBatchedDiskRequests(nodeName, diskURI s
 		klog.V(2).Infof("no attach disk(%s) request on node(%s), diskMap len:%d, %+v", diskURI, nodeName, len(diskMap), diskMap)
 		return nil, nil
 	}
-	c.attachDiskMap.Store(nodeName, make(map[string]*provider.AttachDiskOptions))
|
Member: The original logic always cleared the queue: if removeDisks > 0, keeping the remaining disks in the queue would not succeed in the end anyway, and the CSI driver has retry logic. Clearing the queue keeps the logic more straightforward.

Contributor (Author): I really wanted to avoid dropping the disks that haven't been processed. Even though I agree that k8s will eventually retry, more changes would be needed to do this safely, so I am adding the cleaning-up-the-queue part back.

Contributor: With this approach, would the next attach request that comes in add to this batch of already existing disks in the attachDiskMap? So we won't create a fresh batch, but reuse the originally dropped disks?
+	// Remove disks from the batch if the number is more than the number of disks node can support
+	disksToKeepInQueue := make(map[string]*provider.AttachDiskOptions)
+	removeDisks := len(diskMap) - numDisksAllowed
+	if removeDisks > 0 {
+		klog.V(2).Infof("too many disks to attach, remove %d disks from the request", removeDisks)
+		for currDiskURI, options := range diskMap {
+			if removeDisks == 0 {
+				break
+			}
+			if options != nil && currDiskURI != diskURI {
+				klog.V(2).Infof("remove disk(%s) from current batch request from node(%s) but requeue", currDiskURI, nodeName)
+				disksToKeepInQueue[currDiskURI] = options
+				delete(diskMap, currDiskURI)
+				removeDisks--
+			}
+		}
+	}
+
+	c.attachDiskMap.Store(nodeName, disksToKeepInQueue)
 	return diskMap, nil
 }
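To illustrate the question raised in the review thread, whether trimmed disks are reused by the next batch, here is a minimal, self-contained sketch of the requeue behavior using simplified types instead of provider.AttachDiskOptions. It is an approximation of the new logic, not the driver's actual implementation:

```go
package main

import (
	"fmt"
	"sync"
)

type attachOptions struct{ diskName string }

// attachDiskMap plays the role of controllerCommon.attachDiskMap: nodeName -> pending disk batch.
var attachDiskMap sync.Map

// retrieveBatch mimics retrieveAttachBatchedDiskRequests: it takes the current
// batch for the node, trims it down to numDisksAllowed (never dropping the disk
// that triggered the call), and stores the trimmed-off disks back so the next
// retrieval for this node starts from them.
func retrieveBatch(nodeName, requestedDiskURI string, numDisksAllowed int) map[string]*attachOptions {
	var batch map[string]*attachOptions
	if v, ok := attachDiskMap.Load(nodeName); ok {
		batch = v.(map[string]*attachOptions)
	}
	keep := make(map[string]*attachOptions)
	remove := len(batch) - numDisksAllowed
	for uri, opt := range batch {
		if remove <= 0 {
			break
		}
		if uri != requestedDiskURI {
			keep[uri] = opt // requeue instead of dropping
			delete(batch, uri)
			remove--
		}
	}
	attachDiskMap.Store(nodeName, keep) // requeued disks survive for the next batch
	return batch
}

func main() {
	attachDiskMap.Store("node1", map[string]*attachOptions{
		"disk-a": {"a"}, "disk-b": {"b"}, "disk-c": {"c"},
	})
	first := retrieveBatch("node1", "disk-a", 2)  // disk-a plus one other disk
	second := retrieveBatch("node1", "disk-a", 2) // the disk requeued by the first call
	fmt.Println(len(first), len(second))          // 2 1
}
```

So, per the new logic, disks trimmed from an oversized batch are not lost: they stay under the node's key and are picked up when the next attach request triggers another retrieval for that node.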
|
|
|
|
@@ -646,32 +646,45 @@ func TestAttachDiskRequest(t *testing.T) {
 		nodeName             string
 		diskName             string
 		diskNum              int
+		numDisksAllowed      int
 		duplicateDiskRequest bool
 		expectedErr          bool
 	}{
 		{
 			desc:            "one disk request in queue",
 			diskURI:         "diskURI",
 			nodeName:        "nodeName",
 			diskName:        "diskName",
 			diskNum:         1,
+			numDisksAllowed: 8,
 			expectedErr:     false,
 		},
 		{
 			desc:            "multiple disk requests in queue",
 			diskURI:         "diskURI",
 			nodeName:        "nodeName",
 			diskName:        "diskName",
 			diskNum:         10,
+			numDisksAllowed: 16,
 			expectedErr:     false,
 		},
+		{
+			desc:            "multiple disk requests in queue but exceeds node limit",
+			diskURI:         "diskURI",
+			nodeName:        "nodeName",
+			diskName:        "diskName",
+			diskNum:         10,
+			numDisksAllowed: 8,
+			expectedErr:     false,
+		},
 		{
 			desc:            "zero disk request in queue",
 			diskURI:         "diskURI",
 			nodeName:        "nodeName",
 			diskName:        "diskName",
 			diskNum:         0,
+			numDisksAllowed: 8,
 			expectedErr:     false,
 		},
 		{
 			desc: "multiple disk requests in queue",
|
|
@@ -680,6 +693,7 @@ func TestAttachDiskRequest(t *testing.T) {
 			diskName:             "diskName",
 			duplicateDiskRequest: true,
 			diskNum:              10,
+			numDisksAllowed:      16,
 			expectedErr:          false,
 		},
 	}
|
|
@@ -703,9 +717,13 @@ func TestAttachDiskRequest(t *testing.T) {
 		}

 		diskURI := fmt.Sprintf("%s%d", test.diskURI, test.diskNum)
-		diskMap, err := common.retrieveAttachBatchedDiskRequests(test.nodeName, diskURI)
+		diskMap, err := common.retrieveAttachBatchedDiskRequests(test.nodeName, diskURI, test.numDisksAllowed)
 		assert.Equal(t, test.expectedErr, err != nil, "TestCase[%d]: %s", i, test.desc)
-		assert.Equal(t, test.diskNum, len(diskMap), "TestCase[%d]: %s", i, test.desc)
+		if test.diskNum > test.numDisksAllowed {
+			assert.Equal(t, test.numDisksAllowed, len(diskMap), "TestCase[%d]: %s", i, test.desc)
+		} else {
+			assert.Equal(t, test.diskNum, len(diskMap), "TestCase[%d]: %s", i, test.desc)
+		}
 		for diskURI, opt := range diskMap {
 			assert.Equal(t, strings.Contains(diskURI, test.diskURI), true, "TestCase[%d]: %s", i, test.desc)
 			assert.Equal(t, strings.Contains(opt.DiskName, test.diskName), true, "TestCase[%d]: %s", i, test.desc)
|
|
@@ -1067,8 +1085,8 @@ func TestConcurrentDetachDisk(t *testing.T) {
 		func(_ context.Context, _ string, name string, params armcompute.VirtualMachine) (*armcompute.VirtualMachine, error) {
 			if atomic.AddInt32(&callCount, 1) == 1 {
 				klog.Info("First call to CreateOrUpdate succeeded", "VM Name:", name, "Params:", params)
-				time.Sleep(100 * time.Millisecond) // Simulate some processing time to hold the node lock while the 3rd detach request is made
+				time.Sleep(1000 * time.Millisecond) // Simulate some processing time to hold the node lock while the 3rd detach request is made
 				return nil, nil // First call succeeds
Suggested change:
-				time.Sleep(1000 * time.Millisecond) // Simulate some processing time to hold the node lock while the 3rd detach request is made
+				time.Sleep(100 * time.Millisecond) // Simulate processing time to hold the node lock while the 3rd detach request is made; 100ms is sufficient for concurrency in tests
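For context on the sleep-duration suggestion: the sleep only exists so the first mocked update call holds the per-node lock while the later detach requests queue behind it. A minimal, self-contained sketch of that pattern (all names here are illustrative, not taken from the test):

```go
package main

import (
	"fmt"
	"sync"
	"time"
)

// The first request takes the lock and holds it for a while, as the mocked
// CreateOrUpdate does; two follow-up requests must wait. Whether the hold is
// 100ms or 1000ms only changes how much margin the race has, not the logic
// under test, which is why a shorter sleep keeps the test fast.
func main() {
	var nodeLock sync.Mutex

	nodeLock.Lock() // first request holds the per-node lock
	done := make(chan time.Duration, 2)
	start := time.Now()

	// Two follow-up requests arrive while the lock is held and queue behind it.
	for i := 0; i < 2; i++ {
		go func() {
			nodeLock.Lock()
			defer nodeLock.Unlock()
			done <- time.Since(start)
		}()
	}

	time.Sleep(100 * time.Millisecond) // simulated processing time while holding the lock
	nodeLock.Unlock()

	for i := 0; i < 2; i++ {
		fmt.Printf("queued request released after %v\n", (<-done).Round(10*time.Millisecond))
	}
}
```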