Skip to content

Commit 9bb9617

Browse files
committed
Add retry logic with jitter for feature flag requests
Add exponential backoff with full jitter for feature flag API calls to handle transient failures such as timeouts and connection errors.

- Default of 2 retries (3 total attempts) with exponential backoff
- Full jitter to prevent a thundering herd during incidents
- Retries on: timeouts, connection errors, 5xx, 429
- No retry on: other 4xx client errors, quota limit errors
- Configurable via the `feature_flag_retries` parameter on Client
1 parent b179280 commit 9bb9617

File tree

4 files changed

+432
-8
lines changed

4 files changed

+432
-8
lines changed

posthog/client.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434
)
3535
from posthog.poller import Poller
3636
from posthog.request import (
37+
DEFAULT_FEATURE_FLAG_RETRIES,
3738
DEFAULT_HOST,
3839
APIError,
3940
QuotaLimitError,
@@ -196,6 +197,7 @@ def __init__(
196197
capture_exception_code_variables=False,
197198
code_variables_mask_patterns=None,
198199
code_variables_ignore_patterns=None,
200+
feature_flag_retries=DEFAULT_FEATURE_FLAG_RETRIES,
199201
):
200202
"""
201203
Initialize a new PostHog client instance.
@@ -215,6 +217,9 @@ def __init__(
215217
Category:
216218
Initialization
217219
"""
220+
if not isinstance(feature_flag_retries, int) or feature_flag_retries < 0:
221+
raise ValueError("feature_flag_retries must be a non-negative integer")
222+
218223
self.queue = queue.Queue(max_queue_size)
219224

220225
# api_key: This should be the Team API Key (token), public
@@ -237,6 +242,7 @@ def __init__(
237242
self.feature_flags_request_timeout_seconds = (
238243
feature_flags_request_timeout_seconds
239244
)
245+
self.feature_flag_retries = feature_flag_retries
240246
self.poller = None
241247
self.distinct_ids_feature_flags_reported = SizeLimitedDict(MAX_DICT_SIZE, set)
242248
self.flag_cache = self._initialize_flag_cache(flag_fallback_cache_url)
@@ -540,6 +546,7 @@ def get_flags_decision(
540546
self.api_key,
541547
self.host,
542548
timeout=self.feature_flags_request_timeout_seconds,
549+
retries=self.feature_flag_retries,
543550
**request_data,
544551
)
545552

posthog/request.py

Lines changed: 94 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,9 @@
66
from datetime import date, datetime
77
from gzip import GzipFile
88
from io import BytesIO
9-
from typing import Any, List, Optional, Tuple, Union
10-
9+
from typing import Any, Callable, List, Optional, Tuple, TypeVar, Union
1110

11+
import backoff
1212
import requests
1313
from dateutil.tz import tzutc
1414
from requests.adapters import HTTPAdapter # type: ignore[import-untyped]
@@ -42,6 +42,83 @@
4242
if hasattr(socket, attr):
4343
KEEP_ALIVE_SOCKET_OPTIONS.append((socket.SOL_TCP, getattr(socket, attr), value))
4444

45+
# Default number of retries for feature flag requests (2 retries = 3 total attempts)
46+
DEFAULT_FEATURE_FLAG_RETRIES = 2
47+
48+
T = TypeVar("T")
49+
50+
51+
def _should_giveup_on_exception(exc: Exception) -> bool:
    """
    Decide whether a failed request should be abandoned instead of retried.

    Returns True to give up (stop retrying), False to keep retrying.

    QuotaLimitError and APIError are defined further down in this module;
    they are only looked up when this function is called, so the forward
    reference is safe.
    """
    # Quota limit errors will not resolve by retrying.
    if isinstance(exc, QuotaLimitError):
        return True

    # Anything that is not an API error (network errors, timeouts, etc.)
    # is treated as transient and retried.
    if not isinstance(exc, APIError):
        return False

    status = exc.status
    # A non-integer status cannot be classified, so keep retrying.
    if not isinstance(status, int):
        return False

    # Give up on client errors (4xx) except rate limiting (429);
    # server errors (5xx) and 429 are retried.
    is_client_error = 400 <= status < 500
    return is_client_error and status != 429
70+
71+
72+
def _with_retries(
    fn: Callable[[], T],
    retries: int,
    giveup: Callable[[Exception], bool] = _should_giveup_on_exception,
) -> T:
    """
    Execute a function with exponential backoff retries and full jitter.

    Full jitter spreads out retry attempts to prevent thundering herd
    problems during incidents.

    Note: This application-level retry operates independently from urllib3's
    transport-level retries (configured in HTTPAdapterWithSocketOptions), so
    the total number of network attempts can be higher than the retry count
    suggests.

    Args:
        fn: The zero-argument function to execute.
        retries: Maximum number of retries (0 means no retries, just one
            attempt). Negative values are treated as 0.
        giveup: Function that returns True if we should stop retrying for a
            given exception.

    Returns:
        The result of the function call.

    Raises:
        Exception: Whatever ``fn`` raised on the final attempt, or on the
            first exception for which ``giveup`` returns True.
    """
    # max_tries = retries + 1 (e.g., retries=2 means 3 total attempts).
    # NOTE(review): backoff appears to stop only when its try counter equals
    # max_tries, so a value <= 0 would never be reached and retries would be
    # unbounded. Clamp so there is always exactly one initial attempt.
    max_tries = max(retries, 0) + 1

    log = logging.getLogger("posthog")

    def on_backoff(details):
        log.warning(
            "Request failed (attempt %d/%d), retrying in %.2fs: %s",
            details["tries"],
            max_tries,
            details["wait"],
            details["exception"],
        )

    @backoff.on_exception(
        backoff.expo,
        Exception,
        max_tries=max_tries,
        giveup=giveup,
        jitter=backoff.full_jitter,  # Adds randomness to prevent thundering herd
        on_backoff=on_backoff,
    )
    def execute():
        return fn()

    return execute()
121+
45122

46123
def _mask_tokens_in_url(url: str) -> str:
47124
"""Mask token values in URLs for safe logging, keeping first 10 chars visible."""
@@ -219,13 +296,23 @@ def flags(
219296
host: Optional[str] = None,
220297
gzip: bool = False,
221298
timeout: int = 15,
299+
retries: int = DEFAULT_FEATURE_FLAG_RETRIES,
222300
**kwargs,
223301
) -> Any:
224-
"""Post the `kwargs to the flags API endpoint"""
225-
res = post(api_key, host, "/flags/?v=2", gzip, timeout, **kwargs)
226-
return _process_response(
227-
res, success_message="Feature flags evaluated successfully"
228-
)
302+
"""
303+
Post the kwargs to the flags API endpoint with automatic retries.
304+
305+
The retries parameter controls how many times to retry on transient failures
306+
(timeouts, connection errors, 5xx, 429). Set to 0 for no retries.
307+
"""
308+
309+
def make_request():
310+
res = post(api_key, host, "/flags/?v=2", gzip, timeout, **kwargs)
311+
return _process_response(
312+
res, success_message="Feature flags evaluated successfully"
313+
)
314+
315+
return _with_retries(make_request, retries)
229316

230317

231318
def remote_config(

posthog/test/test_client.py

Lines changed: 51 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99

1010
from posthog.client import Client
1111
from posthog.contexts import get_context_session_id, new_context, set_context_session
12-
from posthog.request import APIError, GetResponse
12+
from posthog.request import APIError, DEFAULT_FEATURE_FLAG_RETRIES, GetResponse
1313
from posthog.test.test_utils import FAKE_TEST_API_KEY
1414
from posthog.types import FeatureFlag, LegacyFlagMetadata
1515
from posthog.version import VERSION
@@ -640,6 +640,7 @@ def test_basic_capture_with_feature_flags_returns_active_only(self, patch_flags)
640640
"random_key",
641641
"https://us.i.posthog.com",
642642
timeout=3,
643+
retries=DEFAULT_FEATURE_FLAG_RETRIES,
643644
distinct_id="distinct_id",
644645
groups={},
645646
person_properties={},
@@ -704,6 +705,7 @@ def test_basic_capture_with_feature_flags_and_disable_geoip_returns_correctly(
704705
"random_key",
705706
"https://us.i.posthog.com",
706707
timeout=12,
708+
retries=DEFAULT_FEATURE_FLAG_RETRIES,
707709
distinct_id="distinct_id",
708710
groups={},
709711
person_properties={},
@@ -1821,6 +1823,7 @@ def test_disable_geoip_default_on_decide(self, patch_flags):
18211823
"random_key",
18221824
"https://us.i.posthog.com",
18231825
timeout=3,
1826+
retries=DEFAULT_FEATURE_FLAG_RETRIES,
18241827
distinct_id="some_id",
18251828
groups={},
18261829
person_properties={"distinct_id": "some_id"},
@@ -1836,6 +1839,7 @@ def test_disable_geoip_default_on_decide(self, patch_flags):
18361839
"random_key",
18371840
"https://us.i.posthog.com",
18381841
timeout=3,
1842+
retries=DEFAULT_FEATURE_FLAG_RETRIES,
18391843
distinct_id="feature_enabled_distinct_id",
18401844
groups={},
18411845
person_properties={"distinct_id": "feature_enabled_distinct_id"},
@@ -1849,6 +1853,7 @@ def test_disable_geoip_default_on_decide(self, patch_flags):
18491853
"random_key",
18501854
"https://us.i.posthog.com",
18511855
timeout=3,
1856+
retries=DEFAULT_FEATURE_FLAG_RETRIES,
18521857
distinct_id="all_flags_payloads_id",
18531858
groups={},
18541859
person_properties={"distinct_id": "all_flags_payloads_id"},
@@ -1894,6 +1899,7 @@ def test_default_properties_get_added_properly(self, patch_flags):
18941899
"random_key",
18951900
"http://app2.posthog.com",
18961901
timeout=3,
1902+
retries=DEFAULT_FEATURE_FLAG_RETRIES,
18971903
distinct_id="some_id",
18981904
groups={"company": "id:5", "instance": "app.posthog.com"},
18991905
person_properties={"distinct_id": "some_id", "x1": "y1"},
@@ -1921,6 +1927,7 @@ def test_default_properties_get_added_properly(self, patch_flags):
19211927
"random_key",
19221928
"http://app2.posthog.com",
19231929
timeout=3,
1930+
retries=DEFAULT_FEATURE_FLAG_RETRIES,
19241931
distinct_id="some_id",
19251932
groups={"company": "id:5", "instance": "app.posthog.com"},
19261933
person_properties={"distinct_id": "override"},
@@ -1941,13 +1948,56 @@ def test_default_properties_get_added_properly(self, patch_flags):
19411948
"random_key",
19421949
"http://app2.posthog.com",
19431950
timeout=3,
1951+
retries=DEFAULT_FEATURE_FLAG_RETRIES,
19441952
distinct_id="some_id",
19451953
groups={},
19461954
person_properties={"distinct_id": "some_id"},
19471955
group_properties={},
19481956
geoip_disable=False,
19491957
)
19501958

1959+
@mock.patch("posthog.client.flags")
def test_client_passes_custom_retry_count_to_flags(self, patch_flags):
    """Client forwards its feature_flag_retries setting to flags()."""
    custom_retries = 5
    flags_response = {
        "featureFlags": {"test-flag": True},
        "featureFlagPayloads": {},
        "errorsWhileComputingFlags": False,
    }
    patch_flags.return_value = flags_response

    # Build a client configured with a non-default retry count.
    client = Client(
        FAKE_TEST_API_KEY,
        host="https://test.posthog.com",
        feature_flag_retries=custom_retries,
        on_error=self.set_fail,
    )

    # Evaluating a flag triggers the (patched) flags() request.
    client.get_feature_flag("test-flag", "user123")

    # flags() must have been called exactly once, with the custom retry count.
    patch_flags.assert_called_once()
    forwarded_kwargs = patch_flags.call_args[1]
    self.assertEqual(forwarded_kwargs["retries"], custom_retries)
1983+
1984+
def test_feature_flag_retries_rejects_negative_value(self):
    """A negative feature_flag_retries must raise ValueError."""
    negative_retries = -1
    with self.assertRaises(ValueError) as raised:
        Client(FAKE_TEST_API_KEY, feature_flag_retries=negative_retries)

    message = str(raised.exception)
    self.assertIn("non-negative integer", message)
1989+
1990+
def test_feature_flag_retries_rejects_non_integer(self):
    """A non-integer feature_flag_retries (here a string) must raise ValueError."""
    string_retries = "2"
    with self.assertRaises(ValueError) as raised:
        Client(FAKE_TEST_API_KEY, feature_flag_retries=string_retries)

    message = str(raised.exception)
    self.assertIn("non-negative integer", message)
1995+
1996+
def test_feature_flag_retries_accepts_zero(self):
    """feature_flag_retries=0 is valid and is stored on the client (retries disabled)."""
    no_retries = 0
    client = Client(FAKE_TEST_API_KEY, feature_flag_retries=no_retries)
    self.assertEqual(client.feature_flag_retries, no_retries)
2000+
19512001
@parameterized.expand(
19522002
[
19532003
# name, sys_platform, version_info, expected_runtime, expected_version, expected_os, expected_os_version, platform_method, platform_return, distro_info

0 commit comments

Comments
 (0)