Skip to content

Commit 9a6e35a

Browse files
authored
Merge pull request kubernetes#95748 from nilo19/bug/fix-lb-update-failed-pip
Update the PIP when it is not in the Succeeded provisioning state during the LB update.
2 parents 5d49a62 + 46593da commit 9a6e35a

File tree

2 files changed

+56
-7
lines changed

2 files changed

+56
-7
lines changed

staging/src/k8s.io/legacy-cloud-providers/azure/azure_backoff.go

Lines changed: 41 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ package azure
2020

2121
import (
2222
"net/http"
23+
"regexp"
2324
"strings"
2425

2526
"github.com/Azure/azure-sdk-for-go/services/compute/mgmt/2019-12-01/compute"
@@ -44,6 +45,12 @@ const (
4445
operationCanceledErrorMessage = "canceledandsupersededduetoanotheroperation"
4546

4647
cannotDeletePublicIPErrorMessageCode = "PublicIPAddressCannotBeDeleted"
48+
49+
referencedResourceNotProvisionedMessageCode = "ReferencedResourceNotProvisioned"
50+
)
51+
52+
var (
53+
pipErrorMessageRE = regexp.MustCompile(`(?:.*)/subscriptions/(?:.*)/resourceGroups/(.*)/providers/Microsoft.Network/publicIPAddresses/([^\s]+)(?:.*)`)
4754
)
4855

4956
// RequestBackoff if backoff is disabled in cloud provider it
@@ -182,7 +189,7 @@ func (az *Cloud) CreateOrUpdateLB(service *v1.Service, lb network.LoadBalancer)
182189
defer cancel()
183190

184191
rgName := az.getLoadBalancerResourceGroup()
185-
rerr := az.LoadBalancerClient.CreateOrUpdate(ctx, rgName, *lb.Name, lb, to.String(lb.Etag))
192+
rerr := az.LoadBalancerClient.CreateOrUpdate(ctx, rgName, to.String(lb.Name), lb, to.String(lb.Etag))
186193
klog.V(10).Infof("LoadBalancerClient.CreateOrUpdate(%s): end", *lb.Name)
187194
if rerr == nil {
188195
// Invalidate the cache right after updating
@@ -192,12 +199,39 @@ func (az *Cloud) CreateOrUpdateLB(service *v1.Service, lb network.LoadBalancer)
192199

193200
// Invalidate the cache because ETAG precondition mismatch.
194201
if rerr.HTTPStatusCode == http.StatusPreconditionFailed {
195-
klog.V(3).Infof("LoadBalancer cache for %s is cleanup because of http.StatusPreconditionFailed", *lb.Name)
202+
klog.V(3).Infof("LoadBalancer cache for %s is cleanup because of http.StatusPreconditionFailed", to.String(lb.Name))
196203
az.lbCache.Delete(*lb.Name)
197204
}
205+
206+
retryErrorMessage := rerr.Error().Error()
198207
// Invalidate the cache because another new operation has canceled the current request.
199-
if strings.Contains(strings.ToLower(rerr.Error().Error()), operationCanceledErrorMessage) {
200-
klog.V(3).Infof("LoadBalancer cache for %s is cleanup because CreateOrUpdate is canceled by another operation", *lb.Name)
208+
if strings.Contains(strings.ToLower(retryErrorMessage), operationCanceledErrorMessage) {
209+
klog.V(3).Infof("LoadBalancer cache for %s is cleanup because CreateOrUpdate is canceled by another operation", to.String(lb.Name))
210+
az.lbCache.Delete(*lb.Name)
211+
}
212+
213+
// The LB update may fail because the referenced PIP is not in the Succeeded provisioning state
214+
if strings.Contains(strings.ToLower(retryErrorMessage), strings.ToLower(referencedResourceNotProvisionedMessageCode)) {
215+
matches := pipErrorMessageRE.FindStringSubmatch(retryErrorMessage)
216+
if len(matches) != 3 {
217+
klog.Warningf("Failed to parse the retry error message %s", retryErrorMessage)
218+
return rerr.Error()
219+
}
220+
pipRG, pipName := matches[1], matches[2]
221+
klog.V(3).Infof("The public IP %s referenced by load balancer %s is not in Succeeded provisioning state, will try to update it", pipName, to.String(lb.Name))
222+
pip, _, err := az.getPublicIPAddress(pipRG, pipName)
223+
if err != nil {
224+
klog.Warningf("Failed to get the public IP %s in resource group %s: %v", pipName, pipRG, err)
225+
return rerr.Error()
226+
}
227+
// Perform a dummy update to fix the provisioning state
228+
err = az.CreateOrUpdatePIP(service, pipRG, pip)
229+
if err != nil {
230+
klog.Warningf("Failed to update the public IP %s in resource group %s: %v", pipName, pipRG, err)
231+
return rerr.Error()
232+
}
233+
// Invalidate the LB cache, return the error, and the controller manager
234+
// would retry the LB update in the next reconcile loop
201235
az.lbCache.Delete(*lb.Name)
202236
}
203237

@@ -241,10 +275,10 @@ func (az *Cloud) CreateOrUpdatePIP(service *v1.Service, pipResourceGroup string,
241275
ctx, cancel := getContextWithCancel()
242276
defer cancel()
243277

244-
rerr := az.PublicIPAddressesClient.CreateOrUpdate(ctx, pipResourceGroup, *pip.Name, pip)
245-
klog.V(10).Infof("PublicIPAddressesClient.CreateOrUpdate(%s, %s): end", pipResourceGroup, *pip.Name)
278+
rerr := az.PublicIPAddressesClient.CreateOrUpdate(ctx, pipResourceGroup, to.String(pip.Name), pip)
279+
klog.V(10).Infof("PublicIPAddressesClient.CreateOrUpdate(%s, %s): end", pipResourceGroup, to.String(pip.Name))
246280
if rerr != nil {
247-
klog.Errorf("PublicIPAddressesClient.CreateOrUpdate(%s, %s) failed: %s", pipResourceGroup, *pip.Name, rerr.Error().Error())
281+
klog.Errorf("PublicIPAddressesClient.CreateOrUpdate(%s, %s) failed: %s", pipResourceGroup, to.String(pip.Name), rerr.Error().Error())
248282
az.Event(service, v1.EventTypeWarning, "CreateOrUpdatePublicIPAddress", rerr.Error().Error())
249283
return rerr.Error()
250284
}

staging/src/k8s.io/legacy-cloud-providers/azure/azure_backoff_test.go

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -240,6 +240,8 @@ func TestCreateOrUpdateLB(t *testing.T) {
240240
ctrl := gomock.NewController(t)
241241
defer ctrl.Finish()
242242

243+
referencedResourceNotProvisionedRawErrorString := `Code="ReferencedResourceNotProvisioned" Message="Cannot proceed with operation because resource /subscriptions/sub/resourceGroups/rg/providers/Microsoft.Network/publicIPAddresses/pip used by resource /subscriptions/sub/resourceGroups/rg/providers/Microsoft.Network/loadBalancers/lb is not in Succeeded state. Resource is in Failed state and the last operation that updated/is updating the resource is PutPublicIpAddressOperation."`
244+
243245
tests := []struct {
244246
clientErr *retry.Error
245247
expectedErr error
@@ -252,6 +254,10 @@ func TestCreateOrUpdateLB(t *testing.T) {
252254
clientErr: &retry.Error{RawError: fmt.Errorf(operationCanceledErrorMessage)},
253255
expectedErr: fmt.Errorf("Retriable: false, RetryAfter: 0s, HTTPStatusCode: 0, RawError: canceledandsupersededduetoanotheroperation"),
254256
},
257+
{
258+
clientErr: &retry.Error{RawError: fmt.Errorf(referencedResourceNotProvisionedRawErrorString)},
259+
expectedErr: fmt.Errorf("Retriable: false, RetryAfter: 0s, HTTPStatusCode: 0, RawError: %s", referencedResourceNotProvisionedRawErrorString),
260+
},
255261
}
256262

257263
for _, test := range tests {
@@ -262,6 +268,15 @@ func TestCreateOrUpdateLB(t *testing.T) {
262268
mockLBClient.EXPECT().CreateOrUpdate(gomock.Any(), az.ResourceGroup, gomock.Any(), gomock.Any(), gomock.Any()).Return(test.clientErr)
263269
mockLBClient.EXPECT().Get(gomock.Any(), az.ResourceGroup, "lb", gomock.Any()).Return(network.LoadBalancer{}, nil)
264270

271+
mockPIPClient := az.PublicIPAddressesClient.(*mockpublicipclient.MockInterface)
272+
mockPIPClient.EXPECT().CreateOrUpdate(gomock.Any(), az.ResourceGroup, "pip", gomock.Any()).Return(nil).AnyTimes()
273+
mockPIPClient.EXPECT().Get(gomock.Any(), az.ResourceGroup, "pip", gomock.Any()).Return(network.PublicIPAddress{
274+
Name: to.StringPtr("pip"),
275+
PublicIPAddressPropertiesFormat: &network.PublicIPAddressPropertiesFormat{
276+
ProvisioningState: to.StringPtr("Succeeded"),
277+
},
278+
}, nil).AnyTimes()
279+
265280
err := az.CreateOrUpdateLB(&v1.Service{}, network.LoadBalancer{
266281
Name: to.StringPtr("lb"),
267282
Etag: to.StringPtr("etag"),

0 commit comments

Comments
 (0)