Skip to content

Commit 1c430ff

Browse files
committed
kubeadm: fix flakes when performing etcd MemberAdd on slower setups
In slower setups it can take more time for the existing cluster to be in a healthy state, so the existing backoff of ~50 seconds is apparently not sufficient. The client dial can also fail for similar reasons. Improve kubeadm's join toleration of adding new etcd members. Wrap both the client dial and member add in a longer backoff (up to ~200 seconds). This particular change should be backported to the supported skew. In a future change for master, all etcd client operations should be made consistent so that the etcd logic is in a sane state.
1 parent 8dd93ca commit 1c430ff

File tree

1 file changed

+21
-12
lines changed

1 file changed

+21
-12
lines changed

cmd/kubeadm/app/util/etcd/etcd.go

Lines changed: 21 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -351,23 +351,32 @@ func (c *Client) AddMember(name string, peerAddrs string) ([]Member, error) {
351351
return nil, errors.Wrapf(err, "error parsing peer address %s", peerAddrs)
352352
}
353353

354-
cli, err := clientv3.New(clientv3.Config{
355-
Endpoints: c.Endpoints,
356-
DialTimeout: dialTimeout,
357-
DialOptions: []grpc.DialOption{
358-
grpc.WithBlock(), // block until the underlying connection is up
359-
},
360-
TLS: c.TLS,
361-
})
362-
if err != nil {
363-
return nil, err
354+
// Exponential backoff for the MemberAdd operation (up to ~200 seconds)
355+
etcdBackoffAdd := wait.Backoff{
356+
Steps: 18,
357+
Duration: 100 * time.Millisecond,
358+
Factor: 1.5,
359+
Jitter: 0.1,
364360
}
365-
defer cli.Close()
366361

367362
// Adds a new member to the cluster
368363
var lastError error
369364
var resp *clientv3.MemberAddResponse
370-
err = wait.ExponentialBackoff(etcdBackoff, func() (bool, error) {
365+
err = wait.ExponentialBackoff(etcdBackoffAdd, func() (bool, error) {
366+
cli, err := clientv3.New(clientv3.Config{
367+
Endpoints: c.Endpoints,
368+
DialTimeout: etcdTimeout,
369+
DialOptions: []grpc.DialOption{
370+
grpc.WithBlock(), // block until the underlying connection is up
371+
},
372+
TLS: c.TLS,
373+
})
374+
if err != nil {
375+
lastError = err
376+
return false, nil
377+
}
378+
defer cli.Close()
379+
371380
ctx, cancel := context.WithTimeout(context.Background(), etcdTimeout)
372381
resp, err = cli.MemberAdd(ctx, []string{peerAddrs})
373382
cancel()

0 commit comments

Comments
 (0)