Skip to content

Commit 1958a83

Browse files
committed
[image-builder] Introduce retry for all resolver requests (timeout 15s, up to 3 times) for all network-related issues (feature flag: imagebuilder_retry_resolve)
1 parent 7d1a8eb commit 1958a83

File tree

8 files changed

+306
-8
lines changed

8 files changed

+306
-8
lines changed

components/image-builder-mk3/cmd/run.go

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ import (
99
"time"
1010

1111
"github.com/gitpod-io/gitpod/common-go/baseserver"
12+
"github.com/gitpod-io/gitpod/common-go/experiments"
1213

1314
"github.com/gitpod-io/gitpod/common-go/log"
1415
"github.com/gitpod-io/gitpod/image-builder/api"
@@ -40,7 +41,12 @@ var runCmd = &cobra.Command{
4041
span, ctx := opentracing.StartSpanFromContext(ctx, "/cmd/Run")
4142
defer span.Finish()
4243

43-
service, err := orchestrator.NewOrchestratingBuilder(cfg.Orchestrator)
44+
// Check feature flag for retry client
45+
experimentsClient := experiments.NewClient()
46+
useRetryClient := experimentsClient.GetBoolValue(ctx, "imagebuilder_retry_resolve", false, experiments.Attributes{})
47+
log.WithField("useRetryClient", useRetryClient).Info("Feature flag imagebuilder_retry_resolve checked")
48+
49+
service, err := orchestrator.NewOrchestratingBuilder(cfg.Orchestrator, useRetryClient)
4450
if err != nil {
4551
log.Fatal(err)
4652
}

components/image-builder-mk3/go.mod

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,8 @@ require (
9393

9494
require (
9595
github.com/Microsoft/hcsshim v0.11.4 // indirect
96+
github.com/blang/semver v3.5.1+incompatible // indirect
97+
github.com/configcat/go-sdk/v7 v7.6.0 // indirect
9698
github.com/containerd/errdefs v0.1.0 // indirect
9799
github.com/containerd/log v0.1.0 // indirect
98100
github.com/jmespath/go-jmespath v0.4.0 // indirect

components/image-builder-mk3/go.sum

Lines changed: 10 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

components/image-builder-mk3/pkg/orchestrator/orchestrator.go

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ const (
5656
)
5757

5858
// NewOrchestratingBuilder creates a new orchestrating image builder
59-
func NewOrchestratingBuilder(cfg config.Configuration) (res *Orchestrator, err error) {
59+
func NewOrchestratingBuilder(cfg config.Configuration, useRetryClient bool) (res *Orchestrator, err error) {
6060
var authentication auth.CompositeAuth
6161
if cfg.PullSecretFile != "" {
6262
fn := cfg.PullSecretFile
@@ -106,14 +106,26 @@ func NewOrchestratingBuilder(cfg config.Configuration) (res *Orchestrator, err e
106106
wsman = wsmanapi.NewWorkspaceManagerClient(conn)
107107
}
108108

109+
// Create RefResolver with conditional retry client based on feature flag
110+
var refResolver resolve.DockerRefResolver
111+
if useRetryClient {
112+
refResolver = &resolve.StandaloneRefResolver{
113+
Client: NewRetryTimeoutClient(),
114+
}
115+
log.Info("Using retry timeout client for Docker registry requests")
116+
} else {
117+
refResolver = &resolve.StandaloneRefResolver{}
118+
log.Info("Using standard client for Docker registry requests")
119+
}
120+
109121
o := &Orchestrator{
110122
Config: cfg,
111123
Auth: authentication,
112124
AuthResolver: auth.Resolver{
113125
BaseImageRepository: cfg.BaseImageRepository,
114126
WorkspaceImageRepository: cfg.WorkspaceImageRepository,
115127
},
116-
RefResolver: &resolve.StandaloneRefResolver{},
128+
RefResolver: refResolver,
117129

118130
wsman: wsman,
119131
buildListener: make(map[string]map[buildListener]struct{}),

components/image-builder-mk3/pkg/orchestrator/orchestrator_test.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -138,7 +138,7 @@ func TestBuild(t *testing.T) {
138138
BaseImageRepository: "registry/base",
139139
WorkspaceImageRepository: "registry/workspace",
140140
BuilderImage: "builder-image",
141-
})
141+
}, false)
142142
if err != nil {
143143
t.Fatal(err)
144144
}
@@ -188,7 +188,7 @@ func TestResolveBaseImage(t *testing.T) {
188188
WorkspaceManager: config.WorkspaceManagerConfig{
189189
Client: wsmock.NewMockWorkspaceManagerClient(ctrl),
190190
},
191-
})
191+
}, false)
192192
if err != nil {
193193
t.Fatal(err)
194194
}
Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,126 @@
1+
// Copyright (c) 2021 Gitpod GmbH. All rights reserved.
2+
// Licensed under the GNU Affero General Public License (AGPL).
3+
// See License.AGPL.txt in the project root for license information.
4+
5+
package orchestrator
6+
7+
import (
8+
"context"
9+
"errors"
10+
"fmt"
11+
"net"
12+
"net/http"
13+
"strings"
14+
"time"
15+
16+
"github.com/gitpod-io/gitpod/common-go/log"
17+
)
18+
19+
const (
20+
// refResolverTimeout is the timeout for individual HTTP requests to Docker registries
21+
refResolverTimeout = 15 * time.Second
22+
23+
// refResolverMaxRetries is the maximum number of retry attempts for registry requests
24+
refResolverMaxRetries = 3
25+
)
26+
27+
// RetryRoundTripper implements http.RoundTripper with retry logic
28+
// for Docker registry requests. It retries only on timeouts and network connectivity issues.
29+
type RetryRoundTripper struct {
30+
base http.RoundTripper
31+
maxRetries int
32+
}
33+
34+
// NewRetryTimeoutClient creates a new HTTP client with timeout and retry capabilities
35+
func NewRetryTimeoutClient() *http.Client {
36+
retryTransport := &RetryRoundTripper{
37+
base: http.DefaultTransport,
38+
maxRetries: refResolverMaxRetries,
39+
}
40+
41+
return &http.Client{
42+
Timeout: refResolverTimeout,
43+
Transport: retryTransport,
44+
}
45+
}
46+
47+
// RoundTrip executes an HTTP request with retry logic for timeouts and network errors
48+
func (r *RetryRoundTripper) RoundTrip(req *http.Request) (*http.Response, error) {
49+
var lastResp *http.Response
50+
var lastErr error
51+
52+
// Use the existing retry function from monitor.go
53+
err := retry(req.Context(), func(ctx context.Context) error {
54+
// Clone the request for each retry attempt
55+
reqClone := req.Clone(ctx)
56+
resp, err := r.base.RoundTrip(reqClone)
57+
if err != nil {
58+
lastErr = err
59+
lastResp = nil
60+
return err
61+
}
62+
// Store the response for successful requests
63+
lastResp = resp
64+
lastErr = nil
65+
return nil
66+
}, r.shouldRetry, 1*time.Second, r.maxRetries)
67+
68+
if err != nil {
69+
if errors.Is(err, errOutOfRetries) {
70+
// Return the last error from the actual HTTP request
71+
if lastErr != nil {
72+
return nil, lastErr
73+
}
74+
return nil, fmt.Errorf("registry request failed after %d retries", r.maxRetries)
75+
}
76+
return nil, err
77+
}
78+
79+
// If we get here, the last attempt was successful
80+
return lastResp, nil
81+
}
82+
83+
// shouldRetry determines if an error should trigger a retry attempt
84+
// Only retries on timeouts and network connectivity issues
85+
func (r *RetryRoundTripper) shouldRetry(err error) bool {
86+
if err == nil {
87+
return false
88+
}
89+
90+
// Check for timeout errors
91+
if errors.Is(err, context.DeadlineExceeded) {
92+
log.WithError(err).Debug("retrying registry request due to timeout")
93+
return true
94+
}
95+
96+
// Check for network connectivity issues
97+
if r.isNetworkError(err) {
98+
log.WithError(err).Debug("retrying registry request due to network error")
99+
return true
100+
}
101+
102+
return false
103+
}
104+
105+
// isNetworkError checks if an error is a network connectivity issue
106+
func (r *RetryRoundTripper) isNetworkError(err error) bool {
107+
if err == nil {
108+
return false
109+
}
110+
111+
// Check for net.Error interface (includes timeout, temporary errors)
112+
if netErr, ok := err.(net.Error); ok {
113+
if netErr.Timeout() || netErr.Temporary() {
114+
return true
115+
}
116+
}
117+
118+
// Check for common network error strings
119+
errorStr := err.Error()
120+
return strings.Contains(errorStr, "connection refused") ||
121+
strings.Contains(errorStr, "no such host") ||
122+
strings.Contains(errorStr, "network is unreachable") ||
123+
strings.Contains(errorStr, "connection reset by peer") ||
124+
strings.Contains(errorStr, "i/o timeout") ||
125+
strings.Contains(errorStr, "net/http: TLS handshake timeout")
126+
}
Lines changed: 130 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,130 @@
1+
// Copyright (c) 2021 Gitpod GmbH. All rights reserved.
2+
// Licensed under the GNU Affero General Public License (AGPL).
3+
// See License.AGPL.txt in the project root for license information.
4+
5+
package orchestrator
6+
7+
import (
8+
"errors"
9+
"net"
10+
"net/http"
11+
"sync/atomic"
12+
"testing"
13+
)
14+
15+
// TestRetryRoundTripper tests the retry logic for network errors and timeouts
16+
func TestRetryRoundTripper(t *testing.T) {
17+
tests := []struct {
18+
name string
19+
errorType string
20+
expectedRetries int
21+
shouldSucceed bool
22+
}{
23+
{
24+
name: "timeout error should retry",
25+
errorType: "timeout",
26+
expectedRetries: 3,
27+
shouldSucceed: false,
28+
},
29+
{
30+
name: "connection refused should retry",
31+
errorType: "connection_refused",
32+
expectedRetries: 3,
33+
shouldSucceed: false,
34+
},
35+
{
36+
name: "network unreachable should retry",
37+
errorType: "network_unreachable",
38+
expectedRetries: 3,
39+
shouldSucceed: false,
40+
},
41+
{
42+
name: "success after retries",
43+
errorType: "success_after_retry",
44+
expectedRetries: 2,
45+
shouldSucceed: true,
46+
},
47+
}
48+
49+
for _, tt := range tests {
50+
t.Run(tt.name, func(t *testing.T) {
51+
var attemptCount int64
52+
53+
// Create a mock transport that simulates different error conditions
54+
mockTransport := &mockRoundTripper{
55+
errorType: tt.errorType,
56+
attemptCount: &attemptCount,
57+
maxAttempts: int64(tt.expectedRetries),
58+
}
59+
60+
retryTransport := &RetryRoundTripper{
61+
base: mockTransport,
62+
maxRetries: refResolverMaxRetries,
63+
}
64+
65+
req, err := http.NewRequest("GET", "http://example.com", nil)
66+
if err != nil {
67+
t.Fatal(err)
68+
}
69+
70+
_, err = retryTransport.RoundTrip(req)
71+
72+
if tt.shouldSucceed && err != nil {
73+
t.Errorf("Expected success but got error: %v", err)
74+
}
75+
if !tt.shouldSucceed && err == nil {
76+
t.Errorf("Expected error but got success")
77+
}
78+
79+
actualAttempts := atomic.LoadInt64(&attemptCount)
80+
if actualAttempts != int64(tt.expectedRetries) {
81+
t.Errorf("Expected %d attempts, got %d", tt.expectedRetries, actualAttempts)
82+
}
83+
})
84+
}
85+
}
86+
87+
// mockRoundTripper simulates different network error conditions for testing
88+
type mockRoundTripper struct {
89+
errorType string
90+
attemptCount *int64
91+
maxAttempts int64
92+
}
93+
94+
func (m *mockRoundTripper) RoundTrip(req *http.Request) (*http.Response, error) {
95+
attempt := atomic.AddInt64(m.attemptCount, 1)
96+
97+
switch m.errorType {
98+
case "timeout":
99+
return nil, &net.OpError{
100+
Op: "dial",
101+
Net: "tcp",
102+
Err: &timeoutError{},
103+
}
104+
case "connection_refused":
105+
return nil, errors.New("connection refused")
106+
case "network_unreachable":
107+
return nil, errors.New("network is unreachable")
108+
case "success_after_retry":
109+
if attempt < m.maxAttempts {
110+
return nil, errors.New("connection refused")
111+
}
112+
// Success on the last attempt
113+
return &http.Response{
114+
StatusCode: 200,
115+
Body: http.NoBody,
116+
}, nil
117+
default:
118+
return &http.Response{
119+
StatusCode: 200,
120+
Body: http.NoBody,
121+
}, nil
122+
}
123+
}
124+
125+
// timeoutError implements net.Error with Timeout() returning true
126+
type timeoutError struct{}
127+
128+
func (e *timeoutError) Error() string { return "timeout" }
129+
func (e *timeoutError) Timeout() bool { return true }
130+
func (e *timeoutError) Temporary() bool { return false }

0 commit comments

Comments
 (0)