Skip to content

Commit 0bb0659

Browse files
ceache authored and jeffwidman committed
fix(core): Implement proper retry backoff logic with jitter.
The new retry logic takes a maximum percentage off the canonical backoff, ensuring gradual, predictable retry timings while still having a controllable amount of jitter (re-introducing the `max_jitter` parameter) to avoid swarming client retries. This fixes a regression introduced in 60366d2 where the retry/backoff logic produced only whole-second (integer) retry delays. This produced inadequate delays on the first retry and would generally not work on fast networks where sub-millisecond retries are desired. Additionally, with a high `max_delay` setting, as the range always spanned from 0 to the last delay, it would also produce extremely random results, with short delays following longer ones, which is contrary to the expected backoff logic.
1 parent c7e8050 commit 0bb0659

File tree

1 file changed

+17
-21
lines changed

1 file changed

+17
-21
lines changed

kazoo/retry.py

Lines changed: 17 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
import logging
22
import random
33
import time
4-
import warnings
54

65
from kazoo.exceptions import (
76
ConnectionClosedError,
@@ -43,19 +42,20 @@ class KazooRetry(object):
4342
SessionExpiredError,
4443
)
4544

46-
def __init__(self, max_tries=1, delay=0.1, backoff=2, max_jitter=None,
47-
max_delay=60, ignore_expire=True, sleep_func=time.sleep,
45+
def __init__(self, max_tries=1, delay=0.1, backoff=2, max_jitter=0.4,
46+
max_delay=60.0, ignore_expire=True, sleep_func=time.sleep,
4847
deadline=None, interrupt=None):
4948
"""Create a :class:`KazooRetry` instance for retrying function
50-
calls with uniform jitter
49+
calls.
5150
5251
:param max_tries: How many times to retry the command. -1 means
5352
infinite tries.
5453
:param delay: Initial delay between retry attempts.
5554
:param backoff: Backoff multiplier between retry attempts.
5655
Defaults to 2 for exponential backoff.
57-
:param max_jitter: *Deprecated* Jitter is now uniformly distributed
58-
across retries.
56+
:param max_jitter: Percentage of jitter to apply to each retry's delay
57+
to ensure all clients to do not hammer the server
58+
at the same time. Between 0.0 and 1.0.
5959
:param max_delay: Maximum delay in seconds, regardless of other
6060
backoff settings. Defaults to one minute.
6161
:param ignore_expire:
@@ -68,15 +68,11 @@ def __init__(self, max_tries=1, delay=0.1, backoff=2, max_jitter=None,
6868
between retries.
6969
7070
"""
71-
if max_jitter is not None:
72-
warnings.warn(
73-
'Passing max_jitter to retry configuration is deprecated.'
74-
' Retry jitter is now automacallity uniform across retries.'
75-
' The parameter will be ignored.',
76-
DeprecationWarning, stacklevel=2)
7771
self.max_tries = max_tries
7872
self.delay = delay
7973
self.backoff = backoff
74+
# Ensure max_jitter is in (0, 1)
75+
self.max_jitter = max(min(max_jitter, 1.0), 0.0)
8076
self.max_delay = float(max_delay)
8177
self._attempts = 0
8278
self._cur_delay = delay
@@ -99,6 +95,7 @@ def copy(self):
9995
obj = KazooRetry(max_tries=self.max_tries,
10096
delay=self.delay,
10197
backoff=self.backoff,
98+
max_jitter=self.max_jitter,
10299
max_delay=self.max_delay,
103100
sleep_func=self.sleep_func,
104101
deadline=self.deadline,
@@ -134,25 +131,24 @@ def __call__(self, func, *args, **kwargs):
134131
if self._attempts == self.max_tries:
135132
raise RetryFailedError("Too many retry attempts")
136133
self._attempts += 1
137-
sleeptime = random.randint(0, 1 + int(self._cur_delay))
134+
jitter = random.uniform(1.0-self.max_jitter,
135+
1.0+self.max_jitter)
136+
sleeptime = self._cur_delay * jitter
138137

139138
if self._cur_stoptime is not None and \
140139
time.time() + sleeptime >= self._cur_stoptime:
141140
raise RetryFailedError("Exceeded retry deadline")
142141

143142
if self.interrupt:
144-
while sleeptime > 0:
143+
remain_time = sleeptime
144+
while remain_time > 0:
145145
# Break the time period down and sleep for no
146146
# longer than 0.1 before calling the interrupt
147-
if sleeptime < 0.1:
148-
self.sleep_func(sleeptime)
149-
sleeptime -= sleeptime
150-
else:
151-
self.sleep_func(0.1)
152-
sleeptime -= 0.1
147+
self.sleep_func(min(0.1, remain_time))
148+
remain_time -= 0.1
153149
if self.interrupt():
154150
raise InterruptedError()
155151
else:
156152
self.sleep_func(sleeptime)
157-
self._cur_delay = min(self._cur_delay * self.backoff,
153+
self._cur_delay = min(sleeptime * self.backoff,
158154
self.max_delay)

0 commit comments

Comments
 (0)