@@ -10,7 +10,7 @@ import (
1010 "github.com/sirupsen/logrus"
1111 resourcemanager "google.golang.org/api/cloudresourcemanager/v1"
1212 "google.golang.org/api/googleapi"
13- iam "google.golang.org/api/iam/v1"
13+ "google.golang.org/api/iam/v1"
1414 "google.golang.org/api/option"
1515 "k8s.io/apimachinery/pkg/util/wait"
1616
@@ -103,7 +103,8 @@ func CreateServiceAccount(ctx context.Context, infraID, projectID, role string)
103103// AddServiceAccountRoles adds predefined roles for service account.
104104func AddServiceAccountRoles (ctx context.Context , projectID , serviceAccountID string , roles []string ) error {
105105 // Get cloudresourcemanager service
106- ctx , cancel := context .WithTimeout (ctx , time .Minute * 1 )
106+ // The context timeout must be greater in time than the exponential backoff below
107+ ctx , cancel := context .WithTimeout (ctx , time .Minute * 2 )
107108 defer cancel ()
108109
109110 ssn , err := gcp .GetSession (ctx )
@@ -117,8 +118,9 @@ func AddServiceAccountRoles(ctx context.Context, projectID, serviceAccountID str
117118
118119 backoff := wait.Backoff {
119120 Duration : 2 * time .Second ,
121+ Factor : 2.0 ,
120122 Jitter : 1.0 ,
121- Steps : 5 ,
123+ Steps : retryCount ,
122124 }
123125 // Get and set the policy in a backoff loop.
124126 // If the policy set fails, the policy must be retrieved again via the get before retrying the set.
@@ -135,8 +137,7 @@ func AddServiceAccountRoles(ctx context.Context, projectID, serviceAccountID str
135137
136138 member := fmt .Sprintf ("serviceAccount:%s" , serviceAccountID )
137139 for _ , role := range roles {
138- err = addMemberToRole (policy , role , member )
139- if err != nil {
140+ if err := addMemberToRole (policy , role , member ); err != nil {
140141 return false , fmt .Errorf ("failed to add role %s to %s: %w" , role , member , err )
141142 }
142143 }
@@ -147,6 +148,15 @@ func AddServiceAccountRoles(ctx context.Context, projectID, serviceAccountID str
147148 lastErr = err
148149 logrus .Debugf ("Concurrent IAM policy changes, restarting read/modify/write" )
149150 return false , nil
151+ } else if isBadStatusError (err ) {
152+ // Documented here, https://cloud.google.com/iam/docs/retry-strategy, google
153+ // indicates that a service account may be created but not active for up to
154+ // 60 seconds. This behavior was causing a failure here when setting the policy
155+ // resulting in a 400 error from the API. If this error occurs retry with an
156+ // exponential backoff.
157+ lastErr = err
158+ logrus .Debugf ("bad request, unexpected error: %s" , err .Error ())
159+ return false , nil
150160 }
151161 return false , fmt .Errorf ("failed to set IAM policy, unexpected error: %w" , err )
152162 }
@@ -224,3 +234,8 @@ func isQuotaExceededError(err error) bool {
224234 }
225235 return false
226236}
237+
238+ func isBadStatusError (err error ) bool {
239+ var ae * googleapi.Error
240+ return errors .As (err , & ae ) && (ae .Code == http .StatusBadRequest )
241+ }
0 commit comments