Skip to content

Commit 462e8c0

Browse files
committed
Add retry logic with jitter for feature flag requests
Add exponential backoff with full jitter for feature flag API calls to handle transient failures such as timeouts and connection errors.

- Default of 2 retries (3 total attempts) with exponential backoff
- Full jitter to prevent a thundering herd during incidents
- Retries on: timeouts, connection errors, 5xx, 429
- No retry on: other 4xx client errors, quota limit errors
- Configurable via the `feature_flag_retries` parameter on Client
1 parent b179280 commit 462e8c0

File tree

4 files changed

+410
-8
lines changed

4 files changed

+410
-8
lines changed

posthog/client.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434
)
3535
from posthog.poller import Poller
3636
from posthog.request import (
37+
DEFAULT_FEATURE_FLAG_RETRIES,
3738
DEFAULT_HOST,
3839
APIError,
3940
QuotaLimitError,
@@ -196,6 +197,7 @@ def __init__(
196197
capture_exception_code_variables=False,
197198
code_variables_mask_patterns=None,
198199
code_variables_ignore_patterns=None,
200+
feature_flag_retries=DEFAULT_FEATURE_FLAG_RETRIES,
199201
):
200202
"""
201203
Initialize a new PostHog client instance.
@@ -237,6 +239,7 @@ def __init__(
237239
self.feature_flags_request_timeout_seconds = (
238240
feature_flags_request_timeout_seconds
239241
)
242+
self.feature_flag_retries = feature_flag_retries
240243
self.poller = None
241244
self.distinct_ids_feature_flags_reported = SizeLimitedDict(MAX_DICT_SIZE, set)
242245
self.flag_cache = self._initialize_flag_cache(flag_fallback_cache_url)
@@ -540,6 +543,7 @@ def get_flags_decision(
540543
self.api_key,
541544
self.host,
542545
timeout=self.feature_flags_request_timeout_seconds,
546+
retries=self.feature_flag_retries,
543547
**request_data,
544548
)
545549

posthog/request.py

Lines changed: 92 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,9 @@
66
from datetime import date, datetime
77
from gzip import GzipFile
88
from io import BytesIO
9-
from typing import Any, List, Optional, Tuple, Union
10-
9+
from typing import Any, Callable, List, Optional, Tuple, TypeVar, Union
1110

11+
import backoff
1212
import requests
1313
from dateutil.tz import tzutc
1414
from requests.adapters import HTTPAdapter # type: ignore[import-untyped]
@@ -42,6 +42,83 @@
4242
if hasattr(socket, attr):
4343
KEEP_ALIVE_SOCKET_OPTIONS.append((socket.SOL_TCP, getattr(socket, attr), value))
4444

45+
# Default number of retries for feature flag requests
46+
DEFAULT_FEATURE_FLAG_RETRIES = 2
47+
48+
T = TypeVar("T")
49+
50+
51+
def _should_giveup_on_exception(exc: Exception) -> bool:
    """
    Decide whether a failed request should be abandoned instead of retried.

    Args:
        exc: The exception raised by the most recent attempt.

    Returns:
        True to give up (stop retrying), False to keep retrying.
    """
    # QuotaLimitError and APIError are declared further down in this module;
    # the names are only resolved when this function is called, so the
    # forward reference is safe.
    if isinstance(exc, QuotaLimitError):
        # Quota limit errors won't resolve with retries.
        return True

    if not isinstance(exc, APIError):
        # Anything that is not an API error (network errors, timeouts, etc.)
        # is retried.
        return False

    status = exc.status
    if not isinstance(status, int):
        # Without a numeric status we cannot classify the error: retry.
        return False

    # Server errors (5xx) and rate limits (429) are retried; every other
    # client error (4xx) is final.
    is_client_error = 400 <= status < 500
    return is_client_error and status != 429
70+
71+
72+
def _with_retries(
    fn: Callable[[], T],
    retries: int,
    giveup: Callable[[Exception], bool] = _should_giveup_on_exception,
) -> T:
    """
    Execute a function with exponential backoff retries and full jitter.

    Uses exponential backoff with full jitter to spread out retry attempts
    and prevent thundering herd problems during incidents.

    Note: This application-level retry operates independently from urllib3's
    transport-level retries (configured in HTTPAdapterWithSocketOptions).
    With default settings, each backoff attempt may trigger up to 4 urllib3
    retries (2 connect + 2 read), so total network attempts could be higher
    than the retry count suggests.

    Args:
        fn: Zero-argument callable to execute.
        retries: Maximum number of retries after the first attempt. Zero or a
            negative value means a single attempt with no retries.
        giveup: Function that returns True if we should stop retrying for a
            given exception.

    Returns:
        The result of the function call.

    Raises:
        Exception: Whatever ``fn`` raised, once ``giveup`` returns True for it
            or the attempt limit is reached.
    """
    log = logging.getLogger("posthog")

    # max_tries = retries + 1 (e.g., retries=2 means 3 total attempts).
    # Negative counts are clamped to zero so the attempt limit handed to
    # backoff is always at least 1; a zero or negative limit is not a
    # meaningful bound on the number of attempts.
    max_tries = max(retries, 0) + 1

    def on_backoff(details):
        log.warning(
            "Request failed (attempt %d/%d), retrying in %.2fs: %s",
            details["tries"],
            max_tries,
            details["wait"],
            details["exception"],
        )

    @backoff.on_exception(
        backoff.expo,
        Exception,
        max_tries=max_tries,
        giveup=giveup,
        jitter=backoff.full_jitter,  # Adds randomness to prevent thundering herd
        on_backoff=on_backoff,
    )
    def execute():
        return fn()

    return execute()
121+
45122

46123
def _mask_tokens_in_url(url: str) -> str:
47124
"""Mask token values in URLs for safe logging, keeping first 10 chars visible."""
@@ -219,13 +296,21 @@ def flags(
219296
host: Optional[str] = None,
220297
gzip: bool = False,
221298
timeout: int = 15,
299+
retries: int = DEFAULT_FEATURE_FLAG_RETRIES,
222300
**kwargs,
223301
) -> Any:
224-
"""Post the `kwargs to the flags API endpoint"""
225-
res = post(api_key, host, "/flags/?v=2", gzip, timeout, **kwargs)
226-
return _process_response(
227-
res, success_message="Feature flags evaluated successfully"
228-
)
302+
"""Post the `kwargs to the flags API endpoint with automatic retries."""
303+
304+
def make_request():
305+
res = post(api_key, host, "/flags/?v=2", gzip, timeout, **kwargs)
306+
return _process_response(
307+
res, success_message="Feature flags evaluated successfully"
308+
)
309+
310+
if retries > 0:
311+
return _with_retries(make_request, retries)
312+
else:
313+
return make_request()
229314

230315

231316
def remote_config(

posthog/test/test_client.py

Lines changed: 34 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99

1010
from posthog.client import Client
1111
from posthog.contexts import get_context_session_id, new_context, set_context_session
12-
from posthog.request import APIError, GetResponse
12+
from posthog.request import APIError, DEFAULT_FEATURE_FLAG_RETRIES, GetResponse
1313
from posthog.test.test_utils import FAKE_TEST_API_KEY
1414
from posthog.types import FeatureFlag, LegacyFlagMetadata
1515
from posthog.version import VERSION
@@ -640,6 +640,7 @@ def test_basic_capture_with_feature_flags_returns_active_only(self, patch_flags)
640640
"random_key",
641641
"https://us.i.posthog.com",
642642
timeout=3,
643+
retries=DEFAULT_FEATURE_FLAG_RETRIES,
643644
distinct_id="distinct_id",
644645
groups={},
645646
person_properties={},
@@ -704,6 +705,7 @@ def test_basic_capture_with_feature_flags_and_disable_geoip_returns_correctly(
704705
"random_key",
705706
"https://us.i.posthog.com",
706707
timeout=12,
708+
retries=DEFAULT_FEATURE_FLAG_RETRIES,
707709
distinct_id="distinct_id",
708710
groups={},
709711
person_properties={},
@@ -1821,6 +1823,7 @@ def test_disable_geoip_default_on_decide(self, patch_flags):
18211823
"random_key",
18221824
"https://us.i.posthog.com",
18231825
timeout=3,
1826+
retries=DEFAULT_FEATURE_FLAG_RETRIES,
18241827
distinct_id="some_id",
18251828
groups={},
18261829
person_properties={"distinct_id": "some_id"},
@@ -1836,6 +1839,7 @@ def test_disable_geoip_default_on_decide(self, patch_flags):
18361839
"random_key",
18371840
"https://us.i.posthog.com",
18381841
timeout=3,
1842+
retries=DEFAULT_FEATURE_FLAG_RETRIES,
18391843
distinct_id="feature_enabled_distinct_id",
18401844
groups={},
18411845
person_properties={"distinct_id": "feature_enabled_distinct_id"},
@@ -1849,6 +1853,7 @@ def test_disable_geoip_default_on_decide(self, patch_flags):
18491853
"random_key",
18501854
"https://us.i.posthog.com",
18511855
timeout=3,
1856+
retries=DEFAULT_FEATURE_FLAG_RETRIES,
18521857
distinct_id="all_flags_payloads_id",
18531858
groups={},
18541859
person_properties={"distinct_id": "all_flags_payloads_id"},
@@ -1894,6 +1899,7 @@ def test_default_properties_get_added_properly(self, patch_flags):
18941899
"random_key",
18951900
"http://app2.posthog.com",
18961901
timeout=3,
1902+
retries=DEFAULT_FEATURE_FLAG_RETRIES,
18971903
distinct_id="some_id",
18981904
groups={"company": "id:5", "instance": "app.posthog.com"},
18991905
person_properties={"distinct_id": "some_id", "x1": "y1"},
@@ -1921,6 +1927,7 @@ def test_default_properties_get_added_properly(self, patch_flags):
19211927
"random_key",
19221928
"http://app2.posthog.com",
19231929
timeout=3,
1930+
retries=DEFAULT_FEATURE_FLAG_RETRIES,
19241931
distinct_id="some_id",
19251932
groups={"company": "id:5", "instance": "app.posthog.com"},
19261933
person_properties={"distinct_id": "override"},
@@ -1941,13 +1948,39 @@ def test_default_properties_get_added_properly(self, patch_flags):
19411948
"random_key",
19421949
"http://app2.posthog.com",
19431950
timeout=3,
1951+
retries=DEFAULT_FEATURE_FLAG_RETRIES,
19441952
distinct_id="some_id",
19451953
groups={},
19461954
person_properties={"distinct_id": "some_id"},
19471955
group_properties={},
19481956
geoip_disable=False,
19491957
)
19501958

1959+
@mock.patch("posthog.client.flags")
def test_client_passes_custom_retry_count_to_flags(self, patch_flags):
    """A custom feature_flag_retries value on Client must reach flags()."""
    custom_retries = 5
    patch_flags.return_value = {
        "featureFlags": {"test-flag": True},
        "featureFlagPayloads": {},
        "errorsWhileComputingFlags": False,
    }

    # Build a client whose retry count differs from the default.
    client = Client(
        FAKE_TEST_API_KEY,
        host="https://test.posthog.com",
        feature_flag_retries=custom_retries,
        on_error=self.set_fail,
    )

    # Evaluating a flag goes through the patched flags() call.
    client.get_feature_flag("test-flag", "user123")

    # Exactly one request, carrying the custom retry count as a keyword.
    patch_flags.assert_called_once()
    _, forwarded_kwargs = patch_flags.call_args
    self.assertEqual(forwarded_kwargs["retries"], custom_retries)
1983+
19511984
@parameterized.expand(
19521985
[
19531986
# name, sys_platform, version_info, expected_runtime, expected_version, expected_os, expected_os_version, platform_method, platform_return, distro_info

0 commit comments

Comments
 (0)