Skip to content

Commit ad61345

Browse files
committed
OCPBUGS-45280: Allow more time for Service Account Creation
** GCP Service accounts were created correctly (and they could be found in the api), but Google states (https://cloud.google.com/iam/docs/retry-strategy) that it could take up to 60 seconds before the service account is able to be used. The exponential backoff is adjusted to actually increase in time each iteration. The context was also adjusted to ensure that we give the backoff enough time to finish. Google sends a 400 Error when the service account policies could NOT be applied, if this happens DURING the policy setting attempt again as long as we have time left in the context.
1 parent 2ea001d commit ad61345

File tree

1 file changed

+20
-5
lines changed
  • pkg/infrastructure/gcp/clusterapi

1 file changed

+20
-5
lines changed

pkg/infrastructure/gcp/clusterapi/iam.go

Lines changed: 20 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ import (
1010
"github.com/sirupsen/logrus"
1111
resourcemanager "google.golang.org/api/cloudresourcemanager/v1"
1212
"google.golang.org/api/googleapi"
13-
iam "google.golang.org/api/iam/v1"
13+
"google.golang.org/api/iam/v1"
1414
"google.golang.org/api/option"
1515
"k8s.io/apimachinery/pkg/util/wait"
1616

@@ -103,7 +103,8 @@ func CreateServiceAccount(ctx context.Context, infraID, projectID, role string)
103103
// AddServiceAccountRoles adds predefined roles for service account.
104104
func AddServiceAccountRoles(ctx context.Context, projectID, serviceAccountID string, roles []string) error {
105105
// Get cloudresourcemanager service
106-
ctx, cancel := context.WithTimeout(ctx, time.Minute*1)
106+
// The context timeout must be greater in time than the exponential backoff below
107+
ctx, cancel := context.WithTimeout(ctx, time.Minute*2)
107108
defer cancel()
108109

109110
ssn, err := gcp.GetSession(ctx)
@@ -117,8 +118,9 @@ func AddServiceAccountRoles(ctx context.Context, projectID, serviceAccountID str
117118

118119
backoff := wait.Backoff{
119120
Duration: 2 * time.Second,
121+
Factor: 2.0,
120122
Jitter: 1.0,
121-
Steps: 5,
123+
Steps: retryCount,
122124
}
123125
// Get and set the policy in a backoff loop.
124126
// If the policy set fails, the policy must be retrieved again via the get before retrying the set.
@@ -135,8 +137,7 @@ func AddServiceAccountRoles(ctx context.Context, projectID, serviceAccountID str
135137

136138
member := fmt.Sprintf("serviceAccount:%s", serviceAccountID)
137139
for _, role := range roles {
138-
err = addMemberToRole(policy, role, member)
139-
if err != nil {
140+
if err := addMemberToRole(policy, role, member); err != nil {
140141
return false, fmt.Errorf("failed to add role %s to %s: %w", role, member, err)
141142
}
142143
}
@@ -147,6 +148,15 @@ func AddServiceAccountRoles(ctx context.Context, projectID, serviceAccountID str
147148
lastErr = err
148149
logrus.Debugf("Concurrent IAM policy changes, restarting read/modify/write")
149150
return false, nil
151+
} else if isBadStatusError(err) {
152+
// Documented here, https://cloud.google.com/iam/docs/retry-strategy, google
153+
// indicates that a service account may be created but not active for up to
154+
// 60 seconds. This behavior was causing a failure here when setting the policy
155+
// resulting in a 400 error from the API. If this error occurs retry with an
156+
// exponential backoff.
157+
lastErr = err
158+
logrus.Debugf("bad request, unexpected error: %s", err.Error())
159+
return false, nil
150160
}
151161
return false, fmt.Errorf("failed to set IAM policy, unexpected error: %w", err)
152162
}
@@ -224,3 +234,8 @@ func isQuotaExceededError(err error) bool {
224234
}
225235
return false
226236
}
237+
238+
func isBadStatusError(err error) bool {
239+
var ae *googleapi.Error
240+
return errors.As(err, &ae) && (ae.Code == http.StatusBadRequest)
241+
}

0 commit comments

Comments
 (0)