Skip to content

Commit 8e548c3

Browse files
Merge pull request openshift#8763 from bfournie/retry-sa
OCPBUGS-37217: CAPI GCP - retry SetIAMPolicy
2 parents 4971b30 + 4a4f7fb commit 8e548c3

File tree

1 file changed

+64
-24
lines changed
  • pkg/infrastructure/gcp/clusterapi

1 file changed

+64
-24
lines changed

pkg/infrastructure/gcp/clusterapi/iam.go

Lines changed: 64 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,17 @@ package clusterapi
22

33
import (
44
"context"
5+
"errors"
56
"fmt"
7+
"net/http"
68
"time"
79

810
"github.com/sirupsen/logrus"
911
resourcemanager "google.golang.org/api/cloudresourcemanager/v1"
12+
"google.golang.org/api/googleapi"
1013
iam "google.golang.org/api/iam/v1"
1114
"google.golang.org/api/option"
15+
"k8s.io/apimachinery/pkg/util/wait"
1216

1317
gcp "github.com/openshift/installer/pkg/asset/installconfig/gcp"
1418
)
@@ -102,48 +106,66 @@ func AddServiceAccountRoles(ctx context.Context, projectID, serviceAccountID str
102106
return fmt.Errorf("failed to create resourcemanager service: %w", err)
103107
}
104108

105-
policy, err := getPolicy(ctx, service, projectID)
106-
if err != nil {
107-
return err
108-
}
109+
backoff := wait.Backoff{
110+
Duration: 2 * time.Second,
111+
Jitter: 1.0,
112+
Steps: 5,
113+
}
114+
// Get and set the policy in a backoff loop.
115+
// If the policy set fails, the policy must be retrieved again via the get before retrying the set.
116+
var lastErr error
117+
if waitErr := wait.ExponentialBackoffWithContext(ctx, backoff, func(ctx context.Context) (bool, error) {
118+
policy, err := getPolicy(ctx, service, projectID)
119+
if isQuotaExceededError(err) {
120+
lastErr = err
121+
logrus.Debugf("Failed to get IAM policy, retrying after backoff")
122+
return false, nil
123+
} else if err != nil {
124+
return false, fmt.Errorf("failed to get IAM policy, unexpected error: %w", err)
125+
}
109126

110-
member := fmt.Sprintf("serviceAccount:%s", serviceAccountID)
111-
for _, role := range roles {
112-
err = addMemberToRole(policy, role, member)
113-
if err != nil {
114-
return fmt.Errorf("failed to add role %s to %s: %w", role, member, err)
127+
member := fmt.Sprintf("serviceAccount:%s", serviceAccountID)
128+
for _, role := range roles {
129+
err = addMemberToRole(policy, role, member)
130+
if err != nil {
131+
return false, fmt.Errorf("failed to add role %s to %s: %w", role, member, err)
132+
}
115133
}
116-
}
117134

118-
err = setPolicy(ctx, service, projectID, policy)
119-
if err != nil {
120-
return err
135+
err = setPolicy(ctx, service, projectID, policy)
136+
if err != nil {
137+
if isConflictError(err) {
138+
lastErr = err
139+
logrus.Debugf("Concurrent IAM policy changes, restarting read/modify/write")
140+
return false, nil
141+
}
142+
return false, fmt.Errorf("failed to set IAM policy, unexpected error: %w", err)
143+
}
144+
logrus.Debugf("Successfully set IAM policy")
145+
return true, nil
146+
}); waitErr != nil {
147+
if wait.Interrupted(waitErr) {
148+
return fmt.Errorf("failed to set IAM policy: %w", lastErr)
149+
}
150+
return waitErr
121151
}
122-
123152
return nil
124153
}
125154

126155
// getPolicy gets the project's IAM policy.
127156
func getPolicy(ctx context.Context, crmService *resourcemanager.Service, projectID string) (*resourcemanager.Policy, error) {
157+
logrus.Debugf("Getting policy for %s", projectID)
128158
request := &resourcemanager.GetIamPolicyRequest{}
129159
policy, err := crmService.Projects.GetIamPolicy(projectID, request).Context(ctx).Do()
130-
if err != nil {
131-
return nil, fmt.Errorf("failed to fetch project IAM policy: %w", err)
132-
}
133-
134-
return policy, nil
160+
return policy, err
135161
}
136162

137163
// setPolicy sets the project's IAM policy.
138164
func setPolicy(ctx context.Context, crmService *resourcemanager.Service, projectID string, policy *resourcemanager.Policy) error {
139165
request := &resourcemanager.SetIamPolicyRequest{}
140166
request.Policy = policy
141167
_, err := crmService.Projects.SetIamPolicy(projectID, request).Context(ctx).Do()
142-
if err != nil {
143-
return fmt.Errorf("failed to set project IAM policy: %w", err)
144-
}
145-
146-
return nil
168+
return err
147169
}
148170

149171
// addMemberToRole adds a member to a role binding.
@@ -175,3 +197,21 @@ func addMemberToRole(policy *resourcemanager.Policy, role, member string) error
175197
logrus.Debugf("adding %s role, added %s member", role, member)
176198
return nil
177199
}
200+
201+
// isConflictError returns true if error matches conflict on concurrent policy sets.
202+
func isConflictError(err error) bool {
203+
var ae *googleapi.Error
204+
if errors.As(err, &ae) && (ae.Code == http.StatusConflict || ae.Code == http.StatusPreconditionFailed) {
205+
return true
206+
}
207+
return false
208+
}
209+
210+
// isQuotaExceededError returns true if the error matches quota exceeded.
211+
func isQuotaExceededError(err error) bool {
212+
var ae *googleapi.Error
213+
if errors.As(err, &ae) && (ae.Code == http.StatusTooManyRequests) {
214+
return true
215+
}
216+
return false
217+
}

0 commit comments

Comments
 (0)