Skip to content

Commit 2fb49ca

Browse files
committed
chore: add Cloud Build build retry in deploy helper
1 parent 69b9f34 commit 2fb49ca

File tree

9 files changed

+462
-59
lines changed

9 files changed

+462
-59
lines changed

helpers/foundation-deployer/gcp/gcp.go

Lines changed: 123 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -15,15 +15,22 @@
1515
package gcp
1616

1717
import (
18+
"context"
19+
"encoding/json"
1820
"fmt"
21+
"regexp"
1922
"strings"
2023
"time"
2124

2225
"github.com/GoogleCloudPlatform/cloud-foundation-toolkit/infra/blueprint-test/pkg/gcloud"
26+
"github.com/GoogleCloudPlatform/cloud-foundation-toolkit/infra/blueprint-test/pkg/utils"
2327
"github.com/mitchellh/go-testing-interface"
2428
"github.com/tidwall/gjson"
2529

2630
"github.com/terraform-google-modules/terraform-example-foundation/test/integration/testutils"
31+
32+
"google.golang.org/api/cloudbuild/v1"
33+
"google.golang.org/api/option"
2734
)
2835

2936
const (
@@ -34,16 +41,73 @@ const (
3441
StatusCancelled = "CANCELLED"
3542
)
3643

44+
// Types used to decode the JSON metadata of a Cloud Build retry operation.
type (
	// RetryOp is the metadata payload of a retry operation. Only the
	// "@type" discriminator and the embedded build are decoded.
	RetryOp struct {
		Type  string `json:"@type"`
		Build Build  `json:"build"`
	}

	// Build carries the subset of build fields read from the retry
	// operation metadata: its ID, status and creation time.
	Build struct {
		ID         string `json:"id"`
		Status     string `json:"status"`
		CreateTime string `json:"createTime"`
	}
)
53+
54+
// retryRegexp maps each compiled retryable-error pattern to the message
// associated with it. It starts empty and is filled by init.
var retryRegexp = make(map[*regexp.Regexp]string)

// ctx is the background context shared by the Cloud Build API calls
// made from this package.
var ctx = context.Background()
58+
59+
func init() {
60+
if len(retryRegexp) == 0 {
61+
for e, m := range testutils.RetryableTransientErrors {
62+
r, err := regexp.Compile(fmt.Sprintf("(?s)%s", e)) //(?s) enables dot (.) to match newline.
63+
if err != nil {
64+
fmt.Printf("failed to compile regex %s: %s", e, err.Error())
65+
}
66+
retryRegexp[r] = m
67+
}
68+
}
69+
}
70+
3771
type GCP struct {
38-
Runf func(t testing.TB, cmd string, args ...interface{}) gjson.Result
39-
sleepTime time.Duration
72+
Runf func(t testing.TB, cmd string, args ...interface{}) gjson.Result
73+
RunCmd func(t testing.TB, cmd string, args ...interface{}) string
74+
TriggerNewBuild func(t testing.TB, buildName string) (string, error)
75+
sleepTime time.Duration
76+
}
77+
78+
// runCmd is a wrapper around gcloud.RunCmd because the original function has an input with a private type
79+
func runCmd(t testing.TB, cmd string, args ...interface{}) string {
80+
return gcloud.RunCmd(t, utils.StringFromTextAndArgs(append([]interface{}{cmd}, args...)...))
81+
}
82+
83+
// triggerNewBuild triggers a new build based on the build provided
84+
func triggerNewBuild(t testing.TB, buildName string) (string, error) {
85+
86+
buildService, err := cloudbuild.NewService(ctx, option.WithScopes(cloudbuild.CloudPlatformScope))
87+
if err != nil {
88+
return "", fmt.Errorf("failed to create Cloud Build service: %w", err)
89+
}
90+
retryOperation, err := buildService.Projects.Locations.Builds.Retry(buildName, &cloudbuild.RetryBuildRequest{}).Do()
91+
if err != nil {
92+
return "", fmt.Errorf("failed to retry build: %w", err)
93+
}
94+
95+
var data RetryOp
96+
err = json.Unmarshal(retryOperation.Metadata, &data)
97+
if err != nil {
98+
return "", fmt.Errorf("Error unmarshaling retry operation metadata: %v", err)
99+
}
100+
101+
return data.Build.ID, nil
40102
}
41103

42104
// NewGCP creates a new wrapper for Google Cloud Platform CLI.
43105
func NewGCP() GCP {
44106
return GCP{
45-
Runf: gcloud.Runf,
46-
sleepTime: 20,
107+
Runf: gcloud.Runf,
108+
RunCmd: runCmd,
109+
TriggerNewBuild: triggerNewBuild,
110+
sleepTime: 20,
47111
}
48112
}
49113

@@ -70,8 +134,9 @@ func (g GCP) GetBuilds(t testing.TB, projectID, region, filter string) map[strin
70134
}
71135

72136
// GetLastBuildStatus gets the status of the last build form a project and region that satisfy the given filter.
73-
func (g GCP) GetLastBuildStatus(t testing.TB, projectID, region, filter string) string {
74-
return g.Runf(t, "builds list --project %s --region %s --limit 1 --sort-by ~createTime --filter %s", projectID, region, filter).Array()[0].Get("status").String()
137+
func (g GCP) GetLastBuildStatus(t testing.TB, projectID, region, filter string) (string, string) {
138+
build := g.Runf(t, "builds list --project %s --region %s --limit 1 --sort-by ~createTime --filter %s", projectID, region, filter).Array()[0]
139+
return build.Get("status").String(), build.Get("id").String()
75140
}
76141

77142
// GetBuildStatus gets the status of the given build
@@ -91,16 +156,21 @@ func (g GCP) GetRunningBuildID(t testing.TB, projectID, region, filter string) s
91156
return ""
92157
}
93158

159+
// GetBuildLogs get the execution logs of the given build
160+
func (g GCP) GetBuildLogs(t testing.TB, projectID, region, buildID string) string {
161+
return g.RunCmd(t, "builds log %s --project %s --region %s", buildID, projectID, region)
162+
}
163+
94164
// GetFinalBuildState gets the terminal status of the given build. It will wait if build is not finished.
95-
func (g GCP) GetFinalBuildState(t testing.TB, projectID, region, buildID string, maxRetry int) (string, error) {
165+
func (g GCP) GetFinalBuildState(t testing.TB, projectID, region, buildID string, maxBuildRetry int) (string, error) {
96166
var status string
97167
count := 0
98168
fmt.Printf("waiting for build %s execution.\n", buildID)
99169
status = g.GetBuildStatus(t, projectID, region, buildID)
100170
fmt.Printf("build status is %s\n", status)
101171
for status != StatusSuccess && status != StatusFailure && status != StatusCancelled {
102172
fmt.Printf("build status is %s\n", status)
103-
if count >= maxRetry {
173+
if count >= maxBuildRetry {
104174
return "", fmt.Errorf("timeout waiting for build '%s' execution", buildID)
105175
}
106176
count = count + 1
@@ -112,29 +182,61 @@ func (g GCP) GetFinalBuildState(t testing.TB, projectID, region, buildID string,
112182
}
113183

114184
// WaitBuildSuccess waits for the current build in a repo to finish.
115-
func (g GCP) WaitBuildSuccess(t testing.TB, project, region, repo, commitSha, failureMsg string, maxRetry int) error {
116-
var filter string
185+
func (g GCP) WaitBuildSuccess(t testing.TB, project, region, repo, commitSha, failureMsg string, maxBuildRetry, maxErrorRetries int, timeBetweenErrorRetries time.Duration) error {
186+
var filter, status, build string
187+
var timeoutErr, err error
188+
117189
if commitSha == "" {
118190
filter = fmt.Sprintf("source.repoSource.repoName:%s", repo)
119191
} else {
120192
filter = fmt.Sprintf("source.repoSource.commitSha:%s", commitSha)
121193
}
122-
build := g.GetRunningBuildID(t, project, region, filter)
123-
if build != "" {
124-
status, err := g.GetFinalBuildState(t, project, region, build, maxRetry)
194+
195+
build = g.GetRunningBuildID(t, project, region, filter)
196+
for i := 0; i < maxErrorRetries; i++ {
197+
if build != "" {
198+
status, timeoutErr = g.GetFinalBuildState(t, project, region, build, maxBuildRetry)
199+
} else {
200+
status, build = g.GetLastBuildStatus(t, project, region, filter)
201+
}
202+
203+
if timeoutErr != nil {
204+
return timeoutErr
205+
} else if status != StatusSuccess {
206+
if !g.IsRetryableError(t, project, region, build) {
207+
return fmt.Errorf("%s\nSee:\nhttps://console.cloud.google.com/cloud-build/builds;region=%s/%s?project=%s\nfor details", failureMsg, region, build, project)
208+
}
209+
fmt.Println("build failed with retryable error. a new build will be triggered.")
210+
} else {
211+
return nil // Build succeeded
212+
}
213+
214+
// Trigger a new build
215+
build, err = g.TriggerNewBuild(t, fmt.Sprintf("projects/%s/locations/%s/builds/%s", project, region, build))
125216
if err != nil {
126-
return err
217+
return fmt.Errorf("failed to trigger new build after %d retries: %w", maxErrorRetries, err)
127218
}
128-
if status != StatusSuccess {
129-
return fmt.Errorf("%s\nSee:\nhttps://console.cloud.google.com/cloud-build/builds;region=%s/%s?project=%s\nfor details", failureMsg, region, build, project)
219+
fmt.Printf("triggered new build with ID: %s (attempt %d/%d)\n", build, i+1, maxErrorRetries)
220+
if i < maxErrorRetries-1 {
221+
time.Sleep(timeBetweenErrorRetries) // Wait before retrying
130222
}
131-
} else {
132-
status := g.GetLastBuildStatus(t, project, region, filter)
133-
if status != StatusSuccess {
134-
return fmt.Errorf("%s\nSee:\nhttps://console.cloud.google.com/cloud-build/builds;region=%s/%s?project=%s\nfor details", failureMsg, region, build, project)
223+
}
224+
return fmt.Errorf("%s\nbuild failed after %d retries.\nSee Cloud Build logs for details.", failureMsg, maxErrorRetries)
225+
}
226+
227+
// IsRetryableError checks the logs of a failed Cloud Build build
228+
// and verify if the error is a transient one and can be retried
229+
func (g GCP) IsRetryableError(t testing.TB, projectID, region, build string) bool {
230+
logs := g.GetBuildLogs(t, projectID, region, build)
231+
found := false
232+
for pattern, msg := range retryRegexp {
233+
if pattern.MatchString(logs) {
234+
found = true
235+
fmt.Printf("error '%s' is worth of a retry\n", msg)
236+
break
135237
}
136238
}
137-
return nil
239+
return found
138240
}
139241

140242
// HasSccNotification checks if a Security Command Center notification exists

helpers/foundation-deployer/gcp/gcp_test.go

Lines changed: 65 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,10 +18,13 @@ import (
1818
"fmt"
1919
"os"
2020
"path/filepath"
21+
"time"
22+
2123
gotest "testing"
2224

2325
"github.com/mitchellh/go-testing-interface"
2426
"github.com/stretchr/testify/assert"
27+
2528
"github.com/tidwall/gjson"
2629
)
2730

@@ -71,12 +74,12 @@ func TestGetLastBuildStatus(t *gotest.T) {
7174
},
7275
sleepTime: 1,
7376
}
74-
status := gcp.GetLastBuildStatus(t, "prj-b-cicd-0123", "us-central1", "filter")
77+
status, _ := gcp.GetLastBuildStatus(t, "prj-b-cicd-0123", "us-central1", "filter")
7578
assert.Equal(t, StatusSuccess, status)
7679

7780
current, err = os.ReadFile(filepath.Join(".", "testdata", "failure_build.json"))
7881
assert.NoError(t, err)
79-
status = gcp.GetLastBuildStatus(t, "prj-b-cicd-0123", "us-central1", "filter")
82+
status, _ = gcp.GetLastBuildStatus(t, "prj-b-cicd-0123", "us-central1", "filter")
8083
assert.Equal(t, StatusFailure, status)
8184
}
8285

@@ -132,10 +135,13 @@ func TestWaitBuildSuccess(t *gotest.T) {
132135
callCount = callCount + 1
133136
return resp
134137
},
138+
RunCmd: func(t testing.TB, cmd string, args ...interface{}) string {
139+
return ""
140+
},
135141
sleepTime: 1,
136142
}
137143

138-
err = gcp.WaitBuildSuccess(t, "prj-b-cicd-0123", "us-central1", "repo", "", "failed_test_for_WaitBuildSuccess", 40)
144+
err = gcp.WaitBuildSuccess(t, "prj-b-cicd-0123", "us-central1", "repo", "", "failed_test_for_WaitBuildSuccess", 40, 2, 1*time.Second)
139145
assert.Error(t, err, "should have failed")
140146
assert.Contains(t, err.Error(), "failed_test_for_WaitBuildSuccess", "should have failed with custom info")
141147
assert.Equal(t, callCount, 3, "Runf must be called three times")
@@ -164,11 +170,66 @@ func TestWaitBuildTimeout(t *gotest.T) {
164170
callCount = callCount + 1
165171
return resp
166172
},
173+
RunCmd: func(t testing.TB, cmd string, args ...interface{}) string {
174+
return ""
175+
},
167176
sleepTime: 1,
168177
}
169178

170-
err = gcp.WaitBuildSuccess(t, "prj-b-cicd-0123", "us-central1", "repo", "", "failed_test_for_WaitBuildSuccess", 1)
179+
err = gcp.WaitBuildSuccess(t, "prj-b-cicd-0123", "us-central1", "repo", "", "failed_test_for_WaitBuildSuccess", 1, 1, 1*time.Second)
171180
assert.Error(t, err, "should have failed")
172181
assert.Contains(t, err.Error(), "timeout waiting for build '736f4689-2497-4382-afd0-b5f0f50eea5b' execution", "should have failed with timeout error")
173182
assert.Equal(t, callCount, 3, "Runf must be called three times")
174183
}
184+
185+
func TestWaitBuildSuccessRetry(t *gotest.T) {
186+
187+
working, err := os.ReadFile(filepath.Join(".", "testdata", "working_build.json"))
188+
assert.NoError(t, err)
189+
failure, err := os.ReadFile(filepath.Join(".", "testdata", "failure_build.json"))
190+
assert.NoError(t, err)
191+
retry, err := os.ReadFile(filepath.Join(".", "testdata", "working_build_retry.json"))
192+
assert.NoError(t, err)
193+
success, err := os.ReadFile(filepath.Join(".", "testdata", "success_build.json"))
194+
assert.NoError(t, err)
195+
196+
runCmdCallCount := 0
197+
triggerNewBuildCallCount := 0
198+
runfCallCount := 0
199+
runfCalls := []gjson.Result{
200+
{Type: gjson.JSON,
201+
Raw: fmt.Sprintf("[%s]", string(working[:]))}, // builds list
202+
{Type: gjson.JSON,
203+
Raw: string(working[:])}, // builds describe
204+
{Type: gjson.JSON,
205+
Raw: string(failure[:])}, // builds describe
206+
{Type: gjson.JSON,
207+
Raw: string(retry[:])}, // builds describe
208+
{Type: gjson.JSON,
209+
Raw: string(success[:])}, // builds describe
210+
}
211+
212+
gcp := GCP{
213+
Runf: func(t testing.TB, cmd string, args ...interface{}) gjson.Result {
214+
resp := runfCalls[runfCallCount]
215+
runfCallCount = runfCallCount + 1
216+
return resp
217+
},
218+
RunCmd: func(t testing.TB, cmd string, args ...interface{}) string {
219+
runCmdCallCount = runCmdCallCount + 1
220+
return "a\nError 403. Compute Engine API has not been used in project\nz" // get build logs
221+
},
222+
TriggerNewBuild: func(t testing.TB, buildName string) (string, error) {
223+
triggerNewBuildCallCount = triggerNewBuildCallCount + 1
224+
return "845f5790-2497-4382-afd0-b5f0f50eea5a", nil // buildService.Projects.Locations.Builds.Retry
225+
},
226+
sleepTime: 1,
227+
}
228+
229+
err = gcp.WaitBuildSuccess(t, "prj-b-cicd-0123", "us-central1", "repo", "", "", 40, 2, 1*time.Second)
230+
231+
assert.Nil(t, err, "should have succeeded")
232+
assert.Equal(t, runfCallCount, 5, "Runf must be called five times")
233+
assert.Equal(t, runCmdCallCount, 1, "runCmd getLogs must be called once")
234+
assert.Equal(t, triggerNewBuildCallCount, 1, "TriggerNewBuild must be called once")
235+
}

0 commit comments

Comments
 (0)