@@ -2,9 +2,11 @@ package kubernetes
22
33import (
44 "context"
5+ stderrors "errors"
56 "fmt"
67 "net"
78 "strings"
9+ "syscall"
810 "time"
911
1012 "github.com/docker/buildx/driver"
@@ -17,6 +19,7 @@ import (
1719 "github.com/docker/go-units"
1820 "github.com/moby/buildkit/client"
1921 "github.com/pkg/errors"
22+ "github.com/sirupsen/logrus"
2023 appsv1 "k8s.io/api/apps/v1"
2124 corev1 "k8s.io/api/core/v1"
2225 apierrors "k8s.io/apimachinery/pkg/api/errors"
@@ -212,11 +215,98 @@ func (d *Driver) Dial(ctx context.Context) (net.Conn, error) {
212215 }
213216 containerName := pod .Spec .Containers [0 ].Name
214217 cmd := []string {"buildctl" , "dial-stdio" }
215- conn , err := execconn .ExecConn (ctx , restClient , restClientConfig , pod .Namespace , pod .Name , containerName , cmd )
216- if err != nil {
217- return nil , err
218+
219+ // Retry connection with exponential backoff for transient errors
220+ // See https://github.com/docker/buildx/issues/2668
221+ var conn net.Conn
222+ err = tryWithBackoff (ctx , pod .Name , func () error {
223+ var err error
224+ conn , err = execconn .ExecConn (ctx , restClient , restClientConfig , pod .Namespace , pod .Name , containerName , cmd )
225+ return err
226+ })
227+ return conn , err
228+ }
229+
230+ // tryWithBackoff retries a function with exponential backoff for transient errors.
231+ // This handles the race condition where Kubernetes marks nodes as "Ready" before their
232+ // Certificate Signing Requests (CSRs) are approved, causing transient TLS errors.
233+ func tryWithBackoff (ctx context.Context , podName string , fn func () error ) error {
234+ const (
235+ maxRetries = 5
236+ baseDelay = 500 * time .Millisecond
237+ maxDelay = 10 * time .Second
238+ )
239+
240+ var lastErr error
241+ for attempt := range maxRetries {
242+ err := fn ()
243+ if err == nil {
244+ return nil
245+ }
246+
247+ lastErr = err
248+
249+ if ! isTransientConnectionError (err ) {
250+ return err
251+ }
252+
253+ if attempt < maxRetries - 1 {
254+ delay := calculateBackoff (attempt , baseDelay , maxDelay )
255+ logrus .Warnf ("Transient connection error to pod %s (attempt %d/%d): %v. Retrying in %v..." ,
256+ podName , attempt + 1 , maxRetries , err , delay )
257+
258+ select {
259+ case <- ctx .Done ():
260+ return context .Cause (ctx )
261+ case <- time .After (delay ):
262+ }
263+ }
264+ }
265+
266+ return errors .Wrapf (lastErr , "failed to connect to pod %s after %d attempts" , podName , maxRetries )
267+ }
268+
// isTransientConnectionError reports whether err looks like a temporary
// connection failure that is worth retrying. A nil error is never transient.
//
// An error is considered transient when, anywhere in its chain, it
//   - is context.DeadlineExceeded,
//   - is net.ErrClosed,
//   - is a net.Error whose Timeout method returns true, or
//   - is a syscall.Errno equal to ECONNREFUSED or ECONNRESET,
//
// or when its message contains "tls: internal error".
func isTransientConnectionError(err error) bool {
	if err == nil {
		return false
	}

	var (
		timeoutErr net.Error
		errno      syscall.Errno
	)
	switch {
	case stderrors.Is(err, context.DeadlineExceeded):
		return true
	case stderrors.Is(err, net.ErrClosed):
		return true
	case stderrors.As(err, &timeoutErr) && timeoutErr.Timeout():
		return true
	case stderrors.As(err, &errno) && (errno == syscall.ECONNREFUSED || errno == syscall.ECONNRESET):
		return true
	}

	// TLS internal errors carry no dedicated type, so the message text is the
	// only thing left to match on.
	return strings.Contains(err.Error(), "tls: internal error")
}
306+
// calculateBackoff returns the delay to wait after the given zero-based
// attempt: baseDelay doubled once per attempt and capped at maxDelay.
//
// The multiplier is compared against the cap before it is applied, so large
// attempt values saturate at maxDelay instead of overflowing time.Duration
// into a zero or negative delay. Negative attempts are treated as attempt 0.
// When baseDelay is not positive, or already reaches maxDelay, the smaller of
// the two is returned without scaling.
func calculateBackoff(attempt int, baseDelay, maxDelay time.Duration) time.Duration {
	if baseDelay <= 0 || baseDelay >= maxDelay {
		return min(baseDelay, maxDelay)
	}
	attempt = max(attempt, 0)

	// 1<<63 no longer fits in a positive int64, and any such multiplier is
	// necessarily past the cap.
	if attempt >= 63 {
		return maxDelay
	}

	// Here 0 < baseDelay < maxDelay, so maxDelay/baseDelay is the largest
	// multiplier whose product stays at or under maxDelay.
	factor := time.Duration(1) << uint(attempt)
	if factor > maxDelay/baseDelay {
		return maxDelay
	}
	return factor * baseDelay
}
221311
222312func (d * Driver ) Client (ctx context.Context , opts ... client.ClientOpt ) (* client.Client , error ) {
0 commit comments