@@ -2105,17 +2105,18 @@ func (c *Cloud) applyUnSchedulableTaint(nodeName types.NodeName, reason string)
 
 // waitForAttachmentStatus polls until the attachment status is the expected value
 // On success, it returns the last attachment state.
-func (d *awsDisk) waitForAttachmentStatus(status string) (*ec2.VolumeAttachment, error) {
+func (d *awsDisk) waitForAttachmentStatus(status string, expectedInstance, expectedDevice string) (*ec2.VolumeAttachment, error) {
 	backoff := wait.Backoff{
 		Duration: volumeAttachmentStatusPollDelay,
 		Factor:   volumeAttachmentStatusFactor,
 		Steps:    volumeAttachmentStatusSteps,
 	}
 
-	// Because of rate limiting, we often see errors from describeVolume
+	// Because of rate limiting, we often see errors from describeVolume.
+	// Or AWS eventual consistency returns unexpected data.
 	// So we tolerate a limited number of failures.
-	// But once we see more than 10 errors in a row, we return the error
-	describeErrorCount := 0
+	// But once we see more than 10 errors in a row, we return the error.
+	errorCount := 0
 
 	// Attach/detach usually takes time. It does not make sense to start
 	// polling DescribeVolumes before some initial delay to let AWS
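The `wait.Backoff` fields above drive `wait.ExponentialBackoff` from k8s.io/apimachinery. For context, a minimal self-contained sketch of those semantics; the interval and step values here are illustrative only, not the real `volumeAttachmentStatus*` constants defined elsewhere in aws.go:

```go
package main

import (
	"fmt"
	"time"

	"k8s.io/apimachinery/pkg/util/wait"
)

func main() {
	// Illustrative values; the real constants are not part of this hunk.
	backoff := wait.Backoff{
		Duration: 2 * time.Second, // initial poll interval
		Factor:   1.2,             // each subsequent interval grows by 20%
		Steps:    10,              // give up after 10 condition calls
	}
	polls := 0
	err := wait.ExponentialBackoff(backoff, func() (bool, error) {
		polls++
		// (true, nil) = done; (false, nil) = keep waiting; (false, err) = abort.
		return polls >= 3, nil
	})
	// err is wait.ErrWaitTimeout when Steps are exhausted before done.
	fmt.Printf("polls=%d err=%v\n", polls, err)
}
```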
@@ -2144,8 +2145,8 @@ func (d *awsDisk) waitForAttachmentStatus(status string) (*ec2.VolumeAttachment,
 					return false, err
 				}
 			}
-			describeErrorCount++
-			if describeErrorCount > volumeAttachmentStatusConsecutiveErrorLimit {
+			errorCount++
+			if errorCount > volumeAttachmentStatusConsecutiveErrorLimit {
 				// report the error
 				return false, err
 			}
@@ -2154,8 +2155,6 @@ func (d *awsDisk) waitForAttachmentStatus(status string) (*ec2.VolumeAttachment,
 			return false, nil
 		}
 
-		describeErrorCount = 0
-
 		if len(info.Attachments) > 1 {
 			// Shouldn't happen; log so we know if it is
 			klog.Warningf("Found multiple attachments for volume %q: %v", d.awsID, info)
@@ -2177,11 +2176,39 @@ func (d *awsDisk) waitForAttachmentStatus(status string) (*ec2.VolumeAttachment,
 		if attachmentStatus == "" {
 			attachmentStatus = "detached"
 		}
+		if attachment != nil {
+			// AWS eventual consistency can go back in time.
+			// For example, we're waiting for a volume to be attached as /dev/xvdba, but AWS can tell us it's
+			// attached as /dev/xvdbb, where it was attached before and it was already detached.
+			// Retry a couple of times, hoping AWS starts reporting the right status.
+			device := aws.StringValue(attachment.Device)
+			if expectedDevice != "" && device != "" && device != expectedDevice {
+				klog.Warningf("Expected device %s %s for volume %s, but found device %s %s", expectedDevice, status, d.name, device, attachmentStatus)
+				errorCount++
+				if errorCount > volumeAttachmentStatusConsecutiveErrorLimit {
+					// report the error
+					return false, fmt.Errorf("attachment of disk %q failed: requested device %q but found %q", d.name, expectedDevice, device)
+				}
+				return false, nil
+			}
+			instanceID := aws.StringValue(attachment.InstanceId)
+			if expectedInstance != "" && instanceID != "" && instanceID != expectedInstance {
+				klog.Warningf("Expected instance %s/%s for volume %s, but found instance %s/%s", expectedInstance, status, d.name, instanceID, attachmentStatus)
+				errorCount++
+				if errorCount > volumeAttachmentStatusConsecutiveErrorLimit {
+					// report the error
+					return false, fmt.Errorf("attachment of disk %q failed: requested instance %q but found %q", d.name, expectedInstance, instanceID)
+				}
+				return false, nil
+			}
+		}
+
 		if attachmentStatus == status {
 			// Attachment is in requested state, finish waiting
 			return true, nil
 		}
 		// continue waiting
+		errorCount = 0
 		klog.V(2).Infof("Waiting for volume %q state: actual=%s, desired=%s", d.awsID, attachmentStatus, status)
 		return false, nil
 	})
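Both new mismatch branches reuse the pattern already applied to `describeVolume` failures: charge each inconsistent read against `volumeAttachmentStatusConsecutiveErrorLimit`, and reset the budget on a read that looks sane (the `errorCount = 0` added just before the "waiting" log line). A standalone sketch of that pattern, with hypothetical names of my own:

```go
package main

import (
	"errors"
	"fmt"
)

// consecutiveErrorBudget is a hypothetical distillation of the errorCount
// bookkeeping above: transient inconsistencies are tolerated until more
// than `limit` of them occur in a row.
type consecutiveErrorBudget struct {
	limit int
	count int
}

// fail charges one inconsistent observation; it returns cause once the
// run exceeds the limit, or nil while retrying is still allowed.
func (b *consecutiveErrorBudget) fail(cause error) error {
	b.count++
	if b.count > b.limit {
		return cause
	}
	return nil
}

// ok resets the run, mirroring `errorCount = 0` on the happy path.
func (b *consecutiveErrorBudget) ok() { b.count = 0 }

func main() {
	budget := &consecutiveErrorBudget{limit: 2}
	for i, consistent := range []bool{false, false, true, false, false, false} {
		if consistent {
			budget.ok()
			fmt.Printf("read %d: consistent, budget reset\n", i)
			continue
		}
		if err := budget.fail(errors.New("stale attachment data")); err != nil {
			fmt.Printf("read %d: giving up: %v\n", i, err)
			return
		}
		fmt.Printf("read %d: inconsistent, retrying\n", i)
	}
}
```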
@@ -2321,7 +2348,7 @@ func (c *Cloud) AttachDisk(diskName KubernetesVolumeID, nodeName types.NodeName)
 		klog.V(2).Infof("AttachVolume volume=%q instance=%q request returned %v", disk.awsID, awsInstance.awsID, attachResponse)
 	}
 
-	attachment, err := disk.waitForAttachmentStatus("attached")
+	attachment, err := disk.waitForAttachmentStatus("attached", awsInstance.awsID, ec2Device)
 
 	if err != nil {
 		if err == wait.ErrWaitTimeout {
@@ -2341,6 +2368,7 @@ func (c *Cloud) AttachDisk(diskName KubernetesVolumeID, nodeName types.NodeName)
 		return "", fmt.Errorf("unexpected state: attachment nil after attached %q to %q", diskName, nodeName)
 	}
 	if ec2Device != aws.StringValue(attachment.Device) {
+		// Already checked in waitForAttachmentStatus(), but just to be sure...
 		return "", fmt.Errorf("disk attachment of %q to %q failed: requested device %q but found %q", diskName, nodeName, ec2Device, aws.StringValue(attachment.Device))
 	}
 	if awsInstance.awsID != aws.StringValue(attachment.InstanceId) {
@@ -2398,7 +2426,7 @@ func (c *Cloud) DetachDisk(diskName KubernetesVolumeID, nodeName types.NodeName)
 		return "", errors.New("no response from DetachVolume")
 	}
 
-	attachment, err := diskInfo.disk.waitForAttachmentStatus("detached")
+	attachment, err := diskInfo.disk.waitForAttachmentStatus("detached", awsInstance.awsID, "")
 	if err != nil {
 		return "", err
 	}
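Note that DetachDisk passes "" for expectedDevice: the `expectedDevice != ""` guard in the poll loop turns an empty argument into "skip this check", since the device a detaching volume reports is no longer interesting. A runnable sketch of just the validation step, using the real nil-safe `aws.StringValue` helper; `checkAttachment` is my name for it, not a function in this PR:

```go
package main

import (
	"fmt"

	"github.com/aws/aws-sdk-go/aws"
	"github.com/aws/aws-sdk-go/service/ec2"
)

// checkAttachment mirrors the validation added above: empty expected
// values disable the corresponding check, and aws.StringValue safely
// turns nil pointers into "".
func checkAttachment(att *ec2.VolumeAttachment, expectedInstance, expectedDevice string) error {
	device := aws.StringValue(att.Device)
	if expectedDevice != "" && device != "" && device != expectedDevice {
		return fmt.Errorf("requested device %q but found %q", expectedDevice, device)
	}
	instanceID := aws.StringValue(att.InstanceId)
	if expectedInstance != "" && instanceID != "" && instanceID != expectedInstance {
		return fmt.Errorf("requested instance %q but found %q", expectedInstance, instanceID)
	}
	return nil
}

func main() {
	stale := &ec2.VolumeAttachment{
		Device:     aws.String("/dev/xvdbb"), // stale device from an earlier attachment
		InstanceId: aws.String("i-0123456789abcdef0"),
	}
	// Attach path: both instance and device are validated.
	fmt.Println(checkAttachment(stale, "i-0123456789abcdef0", "/dev/xvdba"))
	// Detach path: "" for expectedDevice skips the device check.
	fmt.Println(checkAttachment(stale, "i-0123456789abcdef0", ""))
}
```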