Skip to content

Commit d1e8702

Browse files
authored
Merge pull request kubernetes#85201 from fabriziopandini/add-retry-to-etcd
kubeadm: add retry to etcd calls
2 parents (3b440df + 0573a22), commit d1e8702

File tree

1 file changed

+60
-20
lines changed

1 file changed

+60
-20
lines changed

cmd/kubeadm/app/util/etcd/etcd.go

Lines changed: 60 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -38,10 +38,11 @@ import (
3838
"k8s.io/kubernetes/cmd/kubeadm/app/util/config"
3939
)
4040

41-
// Exponential backoff for MemberAdd/Remove (values exclude jitter):
42-
// 0, 50, 150, 350, 750, 1550, 3150, 6350, 12750 ms
43-
var addRemoveBackoff = wait.Backoff{
44-
Steps: 8,
41+
const etcdTimeout = 2 * time.Second
42+
43+
// Exponential backoff for etcd operations
44+
var etcdBackoff = wait.Backoff{
45+
Steps: 9,
4546
Duration: 50 * time.Millisecond,
4647
Factor: 2.0,
4748
Jitter: 0.1,
@@ -146,11 +147,21 @@ func (c *Client) Sync() error {
146147
}
147148
defer cli.Close()
148149

149-
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
150-
err = cli.Sync(ctx)
151-
cancel()
150+
// Syncs the list of endpoints
151+
var lastError error
152+
err = wait.ExponentialBackoff(etcdBackoff, func() (bool, error) {
153+
ctx, cancel := context.WithTimeout(context.Background(), etcdTimeout)
154+
err = cli.Sync(ctx)
155+
cancel()
156+
if err == nil {
157+
return true, nil
158+
}
159+
klog.V(5).Infof("Failed to sync etcd endpoints: %v", err)
160+
lastError = err
161+
return false, nil
162+
})
152163
if err != nil {
153-
return err
164+
return lastError
154165
}
155166
klog.V(1).Infof("etcd endpoints read from etcd: %s", strings.Join(cli.Endpoints(), ","))
156167

@@ -180,11 +191,22 @@ func (c *Client) GetMemberID(peerURL string) (uint64, error) {
180191
}
181192
defer cli.Close()
182193

183-
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
184-
resp, err := cli.MemberList(ctx)
185-
cancel()
194+
// Gets the member list
195+
var lastError error
196+
var resp *clientv3.MemberListResponse
197+
err = wait.ExponentialBackoff(etcdBackoff, func() (bool, error) {
198+
ctx, cancel := context.WithTimeout(context.Background(), etcdTimeout)
199+
resp, err = cli.MemberList(ctx)
200+
cancel()
201+
if err == nil {
202+
return true, nil
203+
}
204+
klog.V(5).Infof("Failed to get etcd member list: %v", err)
205+
lastError = err
206+
return false, nil
207+
})
186208
if err != nil {
187-
return 0, err
209+
return 0, lastError
188210
}
189211

190212
for _, member := range resp.Members {
@@ -213,11 +235,14 @@ func (c *Client) RemoveMember(id uint64) ([]Member, error) {
213235
// Remove an existing member from the cluster
214236
var lastError error
215237
var resp *clientv3.MemberRemoveResponse
216-
err = wait.ExponentialBackoff(addRemoveBackoff, func() (bool, error) {
217-
resp, err = cli.MemberRemove(context.Background(), id)
238+
err = wait.ExponentialBackoff(etcdBackoff, func() (bool, error) {
239+
ctx, cancel := context.WithTimeout(context.Background(), etcdTimeout)
240+
resp, err = cli.MemberRemove(ctx, id)
241+
cancel()
218242
if err == nil {
219243
return true, nil
220244
}
245+
klog.V(5).Infof("Failed to remove etcd member: %v", err)
221246
lastError = err
222247
return false, nil
223248
})
@@ -260,11 +285,14 @@ func (c *Client) AddMember(name string, peerAddrs string) ([]Member, error) {
260285
// Adds a new member to the cluster
261286
var lastError error
262287
var resp *clientv3.MemberAddResponse
263-
err = wait.ExponentialBackoff(addRemoveBackoff, func() (bool, error) {
264-
resp, err = cli.MemberAdd(context.Background(), []string{peerAddrs})
288+
err = wait.ExponentialBackoff(etcdBackoff, func() (bool, error) {
289+
ctx, cancel := context.WithTimeout(context.Background(), etcdTimeout)
290+
resp, err = cli.MemberAdd(ctx, []string{peerAddrs})
291+
cancel()
265292
if err == nil {
266293
return true, nil
267294
}
295+
klog.V(5).Infof("Failed to add etcd member: %v", err)
268296
lastError = err
269297
return false, nil
270298
})
@@ -347,12 +375,24 @@ func (c *Client) getClusterStatus() (map[string]*clientv3.StatusResponse, error)
347375

348376
clusterStatus := make(map[string]*clientv3.StatusResponse)
349377
for _, ep := range c.Endpoints {
350-
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
351-
resp, err := cli.Status(ctx, ep)
352-
cancel()
378+
// Gets the member status
379+
var lastError error
380+
var resp *clientv3.StatusResponse
381+
err = wait.ExponentialBackoff(etcdBackoff, func() (bool, error) {
382+
ctx, cancel := context.WithTimeout(context.Background(), etcdTimeout)
383+
resp, err = cli.Status(ctx, ep)
384+
cancel()
385+
if err == nil {
386+
return true, nil
387+
}
388+
klog.V(5).Infof("Failed to get etcd status for %s: %v", ep, err)
389+
lastError = err
390+
return false, nil
391+
})
353392
if err != nil {
354-
return nil, err
393+
return nil, lastError
355394
}
395+
356396
clusterStatus[ep] = resp
357397
}
358398
return clusterStatus, nil

0 commit comments

Comments
 (0)