Skip to content

Commit 66f4f62

Browse files
authored
Merge pull request #3493 from Guimove/fix-k8s-tls-retry-2668
driver/kubernetes: Add retry logic for transient TLS connection errors
2 parents 191d050 + e758a6b commit 66f4f62

File tree

1 file changed

+94
-4
lines changed

1 file changed

+94
-4
lines changed

driver/kubernetes/driver.go

Lines changed: 94 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,11 @@ package kubernetes
22

33
import (
44
"context"
5+
stderrors "errors"
56
"fmt"
67
"net"
78
"strings"
9+
"syscall"
810
"time"
911

1012
"github.com/docker/buildx/driver"
@@ -17,6 +19,7 @@ import (
1719
"github.com/docker/go-units"
1820
"github.com/moby/buildkit/client"
1921
"github.com/pkg/errors"
22+
"github.com/sirupsen/logrus"
2023
appsv1 "k8s.io/api/apps/v1"
2124
corev1 "k8s.io/api/core/v1"
2225
apierrors "k8s.io/apimachinery/pkg/api/errors"
@@ -212,11 +215,98 @@ func (d *Driver) Dial(ctx context.Context) (net.Conn, error) {
212215
}
213216
containerName := pod.Spec.Containers[0].Name
214217
cmd := []string{"buildctl", "dial-stdio"}
215-
conn, err := execconn.ExecConn(ctx, restClient, restClientConfig, pod.Namespace, pod.Name, containerName, cmd)
216-
if err != nil {
217-
return nil, err
218+
219+
// Retry connection with exponential backoff for transient errors
220+
// See https://github.com/docker/buildx/issues/2668
221+
var conn net.Conn
222+
err = tryWithBackoff(ctx, pod.Name, func() error {
223+
var err error
224+
conn, err = execconn.ExecConn(ctx, restClient, restClientConfig, pod.Namespace, pod.Name, containerName, cmd)
225+
return err
226+
})
227+
return conn, err
228+
}
229+
230+
// tryWithBackoff retries a function with exponential backoff for transient errors.
231+
// This handles the race condition where Kubernetes marks nodes as "Ready" before their
232+
// Certificate Signing Requests (CSRs) are approved, causing transient TLS errors.
233+
func tryWithBackoff(ctx context.Context, podName string, fn func() error) error {
234+
const (
235+
maxRetries = 5
236+
baseDelay = 500 * time.Millisecond
237+
maxDelay = 10 * time.Second
238+
)
239+
240+
var lastErr error
241+
for attempt := range maxRetries {
242+
err := fn()
243+
if err == nil {
244+
return nil
245+
}
246+
247+
lastErr = err
248+
249+
if !isTransientConnectionError(err) {
250+
return err
251+
}
252+
253+
if attempt < maxRetries-1 {
254+
delay := calculateBackoff(attempt, baseDelay, maxDelay)
255+
logrus.Warnf("Transient connection error to pod %s (attempt %d/%d): %v. Retrying in %v...",
256+
podName, attempt+1, maxRetries, err, delay)
257+
258+
select {
259+
case <-ctx.Done():
260+
return context.Cause(ctx)
261+
case <-time.After(delay):
262+
}
263+
}
264+
}
265+
266+
return errors.Wrapf(lastErr, "failed to connect to pod %s after %d attempts", podName, maxRetries)
267+
}
268+
269+
// isTransientConnectionError reports whether err looks like a temporary
// connectivity failure that is worth retrying. A nil error is never transient.
//
// An error is considered transient when any of the following holds:
//   - it matches context.DeadlineExceeded or net.ErrClosed;
//   - a net.Error in its chain reports Timeout();
//   - a syscall.Errno in its chain is ECONNREFUSED or ECONNRESET;
//   - its message contains "tls: internal error" (this case is matched by
//     message text because no typed check is used for it).
func isTransientConnectionError(err error) bool {
	if err == nil {
		return false
	}

	var (
		timeoutErr net.Error
		errno      syscall.Errno
	)

	switch {
	case stderrors.Is(err, context.DeadlineExceeded):
		// Deadline expired while connecting.
		return true
	case stderrors.Is(err, net.ErrClosed):
		// The connection was closed underneath us.
		return true
	case stderrors.As(err, &timeoutErr) && timeoutErr.Timeout():
		// Any network-level timeout.
		return true
	case stderrors.As(err, &errno) && (errno == syscall.ECONNREFUSED || errno == syscall.ECONNRESET):
		// Connection refused / reset by peer.
		return true
	}

	// Last resort: string match for TLS internal errors.
	return strings.Contains(err.Error(), "tls: internal error")
}
306+
307+
// calculateBackoff returns the exponential backoff delay for the given
// zero-based attempt: baseDelay doubled once per attempt, capped at maxDelay.
//
// The doubling saturates at maxDelay instead of overflowing, so arbitrarily
// large attempt values always yield maxDelay rather than a zero or negative
// duration. A negative attempt is treated like attempt 0. A non-positive
// baseDelay is not grown and is returned as-is (still capped at maxDelay).
func calculateBackoff(attempt int, baseDelay, maxDelay time.Duration) time.Duration {
	if baseDelay <= 0 {
		// Nothing meaningful to double; just apply the cap.
		if baseDelay > maxDelay {
			return maxDelay
		}
		return baseDelay
	}

	delay := baseDelay
	for i := 0; i < attempt; i++ {
		// Doubling would exceed the cap (and could overflow int64), so stop
		// here. This also bounds the loop to at most ~63 iterations.
		if delay > maxDelay/2 {
			return maxDelay
		}
		delay *= 2
	}

	if delay > maxDelay {
		return maxDelay
	}
	return delay
}
221311

222312
func (d *Driver) Client(ctx context.Context, opts ...client.ClientOpt) (*client.Client, error) {

0 commit comments

Comments
 (0)