@@ -2105,17 +2105,18 @@ func (c *Cloud) applyUnSchedulableTaint(nodeName types.NodeName, reason string)
2105
2105
2106
2106
// waitForAttachmentStatus polls until the attachment status is the expected value.
2107
2107
// On success, it returns the last attachment state.
2108
- func (d * awsDisk ) waitForAttachmentStatus (status string ) (* ec2.VolumeAttachment , error ) {
2108
+ func (d * awsDisk ) waitForAttachmentStatus (status string , expectedDevice string ) (* ec2.VolumeAttachment , error ) {
2109
2109
backoff := wait.Backoff {
2110
2110
Duration : volumeAttachmentStatusPollDelay ,
2111
2111
Factor : volumeAttachmentStatusFactor ,
2112
2112
Steps : volumeAttachmentStatusSteps ,
2113
2113
}
2114
2114
2115
- // Because of rate limiting, we often see errors from describeVolume
2115
+ // Because of rate limiting, we often see errors from describeVolume.
2116
+ // AWS eventual consistency can also return unexpected data.
2116
2117
// So we tolerate a limited number of failures.
2117
- // But once we see more than 10 errors in a row, we return the error
2118
- describeErrorCount := 0
2118
+ // But once we see more than 10 errors in a row, we return the error.
2119
+ errorCount := 0
2119
2120
2120
2121
// Attach/detach usually takes time. It does not make sense to start
2121
2122
// polling DescribeVolumes before some initial delay to let AWS
@@ -2144,8 +2145,8 @@ func (d *awsDisk) waitForAttachmentStatus(status string) (*ec2.VolumeAttachment,
2144
2145
return false , err
2145
2146
}
2146
2147
}
2147
- describeErrorCount ++
2148
- if describeErrorCount > volumeAttachmentStatusConsecutiveErrorLimit {
2148
+ errorCount ++
2149
+ if errorCount > volumeAttachmentStatusConsecutiveErrorLimit {
2149
2150
// report the error
2150
2151
return false , err
2151
2152
}
@@ -2154,8 +2155,6 @@ func (d *awsDisk) waitForAttachmentStatus(status string) (*ec2.VolumeAttachment,
2154
2155
return false , nil
2155
2156
}
2156
2157
2157
- describeErrorCount = 0
2158
-
2159
2158
if len (info .Attachments ) > 1 {
2160
2159
// Shouldn't happen; log so we know if it is
2161
2160
klog .Warningf ("Found multiple attachments for volume %q: %v" , d .awsID , info )
@@ -2177,11 +2176,29 @@ func (d *awsDisk) waitForAttachmentStatus(status string) (*ec2.VolumeAttachment,
2177
2176
if attachmentStatus == "" {
2178
2177
attachmentStatus = "detached"
2179
2178
}
2179
+ if attachment != nil && expectedDevice != "" {
2180
+ device := aws .StringValue (attachment .Device )
2181
+ if device != "" && device != expectedDevice {
2182
+ // AWS eventual consistency can go back in time.
2183
+ // For example, we're waiting for a volume to be attached as /dev/xvdba, but AWS can tell us it's
2184
+ // attached as /dev/xvdbb, where it was previously attached and has since been detached.
2185
+ // Retry a couple of times, hoping AWS starts reporting the right status.
2186
+ klog .Warningf ("Expected device %s %s, but found device %s %s" , expectedDevice , status , device , attachmentStatus )
2187
+ errorCount ++
2188
+ if errorCount > volumeAttachmentStatusConsecutiveErrorLimit {
2189
+ // report the error
2190
+ return false , fmt .Errorf ("attachment of disk %q failed: requested device %q but found %q" , d .name , expectedDevice , device )
2191
+ }
2192
+ return false , nil
2193
+ }
2194
+ }
2195
+
2180
2196
if attachmentStatus == status {
2181
2197
// Attachment is in requested state, finish waiting
2182
2198
return true , nil
2183
2199
}
2184
2200
// continue waiting
2201
+ errorCount = 0
2185
2202
klog .V (2 ).Infof ("Waiting for volume %q state: actual=%s, desired=%s" , d .awsID , attachmentStatus , status )
2186
2203
return false , nil
2187
2204
})
@@ -2321,7 +2338,7 @@ func (c *Cloud) AttachDisk(diskName KubernetesVolumeID, nodeName types.NodeName)
2321
2338
klog .V (2 ).Infof ("AttachVolume volume=%q instance=%q request returned %v" , disk .awsID , awsInstance .awsID , attachResponse )
2322
2339
}
2323
2340
2324
- attachment , err := disk .waitForAttachmentStatus ("attached" )
2341
+ attachment , err := disk .waitForAttachmentStatus ("attached" , ec2Device )
2325
2342
2326
2343
if err != nil {
2327
2344
if err == wait .ErrWaitTimeout {
@@ -2341,6 +2358,7 @@ func (c *Cloud) AttachDisk(diskName KubernetesVolumeID, nodeName types.NodeName)
2341
2358
return "" , fmt .Errorf ("unexpected state: attachment nil after attached %q to %q" , diskName , nodeName )
2342
2359
}
2343
2360
if ec2Device != aws .StringValue (attachment .Device ) {
2361
+ // Already checked in waitForAttachmentStatus(), but just to be sure...
2344
2362
return "" , fmt .Errorf ("disk attachment of %q to %q failed: requested device %q but found %q" , diskName , nodeName , ec2Device , aws .StringValue (attachment .Device ))
2345
2363
}
2346
2364
if awsInstance .awsID != aws .StringValue (attachment .InstanceId ) {
@@ -2398,7 +2416,7 @@ func (c *Cloud) DetachDisk(diskName KubernetesVolumeID, nodeName types.NodeName)
2398
2416
return "" , errors .New ("no response from DetachVolume" )
2399
2417
}
2400
2418
2401
- attachment , err := diskInfo .disk .waitForAttachmentStatus ("detached" )
2419
+ attachment , err := diskInfo .disk .waitForAttachmentStatus ("detached" , "" )
2402
2420
if err != nil {
2403
2421
return "" , err
2404
2422
}
0 commit comments