
Commit 578692f

feat: better reconnect gherkins

Signed-off-by: Simon Schrottner <[email protected]>

1 parent: 335cce7

File tree: 26 files changed, +837 -348 lines
providers/openfeature-provider-flagd/pyproject.toml

Lines changed: 1 addition & 1 deletion

@@ -18,7 +18,7 @@ classifiers = [
 keywords = []
 dependencies = [
   "openfeature-sdk>=0.6.0",
-  "grpcio>=1.68.0",
+  "grpcio>=1.68.1",
   "protobuf>=4.25.2",
   "mmh3>=4.1.0",
   "panzi-json-logic>=1.0.1",

providers/openfeature-provider-flagd/pytest.ini

Lines changed: 8 additions & 0 deletions

@@ -4,7 +4,15 @@ markers =
     in-process: tests for rpc mode.
     customCert: Supports custom certs.
     unixsocket: Supports unixsockets.
+    targetURI: Supports targetURI.
+    grace: Supports grace attempts.
+    targeting: Supports targeting.
+    fractional: Supports fractional.
+    string: Supports string.
+    semver: Supports semver.
+    reconnect: Supports reconnect.
     events: Supports events.
     sync: Supports sync.
     caching: Supports caching.
     offline: Supports offline.
+bdd_features_base_dir = tests/features
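
The added markers tag the Gherkin-driven suites that pytest-bdd discovers below the new bdd_features_base_dir (tests/features). As a rough illustration of how a suite opts in (the module and feature file names here are assumptions, not part of this commit):

# illustrative only: file and feature names are assumed
import pytest
from pytest_bdd import scenarios

# the whole module is tagged with the new marker registered above
pytestmark = pytest.mark.reconnect

# pytest-bdd resolves this path relative to bdd_features_base_dir (tests/features)
scenarios("reconnect.feature")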

providers/openfeature-provider-flagd/src/openfeature/contrib/provider/flagd/config.py

Lines changed: 7 additions & 7 deletions

@@ -26,7 +26,7 @@ class CacheType(Enum):
 DEFAULT_RESOLVER_TYPE = ResolverType.RPC
 DEFAULT_RETRY_BACKOFF = 1000
 DEFAULT_RETRY_BACKOFF_MAX = 120000
-DEFAULT_RETRY_GRACE_ATTEMPTS = 5
+DEFAULT_RETRY_GRACE_PERIOD = 5
 DEFAULT_STREAM_DEADLINE = 600000
 DEFAULT_TLS = False

@@ -41,7 +41,7 @@ class CacheType(Enum):
 ENV_VAR_RESOLVER_TYPE = "FLAGD_RESOLVER"
 ENV_VAR_RETRY_BACKOFF_MS = "FLAGD_RETRY_BACKOFF_MS"
 ENV_VAR_RETRY_BACKOFF_MAX_MS = "FLAGD_RETRY_BACKOFF_MAX_MS"
-ENV_VAR_RETRY_GRACE_ATTEMPTS = "FLAGD_RETRY_GRACE_ATTEMPTS"
+ENV_VAR_RETRY_GRACE_PERIOD = "FLAGD_RETRY_GRACE_PERIOD"
 ENV_VAR_STREAM_DEADLINE_MS = "FLAGD_STREAM_DEADLINE_MS"
 ENV_VAR_TLS = "FLAGD_TLS"

@@ -81,7 +81,7 @@ def __init__( # noqa: PLR0913
         offline_poll_interval_ms: typing.Optional[int] = None,
         retry_backoff_ms: typing.Optional[int] = None,
         retry_backoff_max_ms: typing.Optional[int] = None,
-        retry_grace_attempts: typing.Optional[int] = None,
+        retry_grace_period: typing.Optional[int] = None,
         deadline_ms: typing.Optional[int] = None,
         stream_deadline_ms: typing.Optional[int] = None,
         keep_alive_time: typing.Optional[int] = None,
@@ -115,14 +115,14 @@ def __init__( # noqa: PLR0913
             else retry_backoff_max_ms
         )

-        self.retry_grace_attempts: int = (
+        self.retry_grace_period: int = (
             int(
                 env_or_default(
-                    ENV_VAR_RETRY_GRACE_ATTEMPTS, DEFAULT_RETRY_GRACE_ATTEMPTS, cast=int
+                    ENV_VAR_RETRY_GRACE_PERIOD, DEFAULT_RETRY_GRACE_PERIOD, cast=int
                 )
             )
-            if retry_grace_attempts is None
-            else retry_grace_attempts
+            if retry_grace_period is None
+            else retry_grace_period
         )

         self.resolver = (
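
As the hunk above shows, the renamed setting can be supplied either as a constructor argument or through the environment via env_or_default. A minimal sketch of the environment route, assuming nothing passes retry_grace_period explicitly (the value 10 is arbitrary):

import os

# assumption for illustration: retry_grace_period is not passed to the config,
# so env_or_default reads FLAGD_RETRY_GRACE_PERIOD and otherwise falls back to the default of 5
os.environ["FLAGD_RETRY_GRACE_PERIOD"] = "10"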

providers/openfeature-provider-flagd/src/openfeature/contrib/provider/flagd/provider.py

Lines changed: 9 additions & 9 deletions

@@ -43,34 +43,34 @@ def __init__( # noqa: PLR0913
         host: typing.Optional[str] = None,
         port: typing.Optional[int] = None,
         tls: typing.Optional[bool] = None,
-        deadline: typing.Optional[int] = None,
+        deadline_ms: typing.Optional[int] = None,
         timeout: typing.Optional[int] = None,
         retry_backoff_ms: typing.Optional[int] = None,
         resolver_type: typing.Optional[ResolverType] = None,
         offline_flag_source_path: typing.Optional[str] = None,
         stream_deadline_ms: typing.Optional[int] = None,
         keep_alive_time: typing.Optional[int] = None,
-        cache_type: typing.Optional[CacheType] = None,
+        cache: typing.Optional[CacheType] = None,
         max_cache_size: typing.Optional[int] = None,
         retry_backoff_max_ms: typing.Optional[int] = None,
-        retry_grace_attempts: typing.Optional[int] = None,
+        retry_grace_period: typing.Optional[int] = None,
     ):
         """
         Create an instance of the FlagdProvider

         :param host: the host to make requests to
         :param port: the port the flagd service is available on
         :param tls: enable/disable secure TLS connectivity
-        :param deadline: the maximum to wait before a request times out
+        :param deadline_ms: the maximum to wait before a request times out
         :param timeout: the maximum time to wait before a request times out
         :param retry_backoff_ms: the number of milliseconds to backoff
         :param offline_flag_source_path: the path to the flag source file
         :param stream_deadline_ms: the maximum time to wait before a request times out
         :param keep_alive_time: the number of milliseconds to keep alive
         :param resolver_type: the type of resolver to use
         """
-        if deadline is None and timeout is not None:
-            deadline = timeout * 1000
+        if deadline_ms is None and timeout is not None:
+            deadline_ms = timeout * 1000
             warnings.warn(
                 "'timeout' property is deprecated, please use 'deadline' instead, be aware that 'deadline' is in milliseconds",
                 DeprecationWarning,
@@ -81,15 +81,15 @@ def __init__( # noqa: PLR0913
             host=host,
             port=port,
             tls=tls,
-            deadline_ms=deadline,
+            deadline_ms=deadline_ms,
             retry_backoff_ms=retry_backoff_ms,
             retry_backoff_max_ms=retry_backoff_max_ms,
-            retry_grace_attempts=retry_grace_attempts,
+            retry_grace_period=retry_grace_period,
             resolver=resolver_type,
             offline_flag_source_path=offline_flag_source_path,
             stream_deadline_ms=stream_deadline_ms,
             keep_alive_time=keep_alive_time,
-            cache=cache_type,
+            cache=cache,
             max_cache_size=max_cache_size,
         )
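
For callers, the net effect is a set of renamed keyword arguments: deadline_ms replaces deadline (and the deprecated timeout), cache replaces cache_type, and retry_grace_period replaces retry_grace_attempts. A hedged usage sketch (host, port, and values are placeholders; the import paths follow the package layout shown in this diff):

from openfeature import api
from openfeature.contrib.provider.flagd import FlagdProvider
from openfeature.contrib.provider.flagd.config import CacheType

provider = FlagdProvider(
    host="localhost",       # placeholder
    port=8013,              # placeholder
    deadline_ms=500,        # formerly `deadline` (or the deprecated `timeout` in seconds)
    cache=CacheType.LRU,    # formerly `cache_type`
    retry_grace_period=5,   # formerly `retry_grace_attempts`
)
api.set_provider(provider)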

providers/openfeature-provider-flagd/src/openfeature/contrib/provider/flagd/resolvers/grpc.py

Lines changed: 91 additions & 77 deletions

@@ -7,6 +7,7 @@
 from cachebox import BaseCacheImpl, LRUCache
 from google.protobuf.json_format import MessageToDict
 from google.protobuf.struct_pb2 import Struct
+from grpc import ChannelConnectivity

 from openfeature.evaluation_context import EvaluationContext
 from openfeature.event import ProviderEventDetails
@@ -47,53 +48,60 @@ def __init__(
             [ProviderEventDetails], None
         ],
     ):
+        self.active = False
         self.config = config
         self.emit_provider_ready = emit_provider_ready
         self.emit_provider_error = emit_provider_error
         self.emit_provider_stale = emit_provider_stale
         self.emit_provider_configuration_changed = emit_provider_configuration_changed
-        self.cache: typing.Optional[BaseCacheImpl] = (
-            LRUCache(maxsize=self.config.max_cache_size)
-            if self.config.cache == CacheType.LRU
-            else None
-        )
-        self.stub, self.channel = self._create_stub()
-        self.retry_backoff_seconds = config.retry_backoff_ms * 0.001
-        self.retry_backoff_max_seconds = config.retry_backoff_max_ms * 0.001
-        self.retry_grace_attempts = config.retry_grace_attempts
+        self.cache: typing.Optional[BaseCacheImpl] = self._create_cache()
+
+        self.retry_grace_period = config.retry_grace_period
         self.streamline_deadline_seconds = config.stream_deadline_ms * 0.001
         self.deadline = config.deadline_ms * 0.001
         self.connected = False
-
-    def _create_stub(
-        self,
-    ) -> typing.Tuple[evaluation_pb2_grpc.ServiceStub, grpc.Channel]:
-        config = self.config
         channel_factory = grpc.secure_channel if config.tls else grpc.insecure_channel
-        channel = channel_factory(
+
+        # Create the channel with the service config
+        options = [
+            ("grpc.keepalive_time_ms", config.keep_alive_time),
+            ("grpc.initial_reconnect_backoff_ms", config.retry_backoff_ms),
+            ("grpc.max_reconnect_backoff_ms", config.retry_backoff_max_ms),
+            ("grpc.min_reconnect_backoff_ms", config.deadline_ms),
+        ]
+
+        self.channel = channel_factory(
             f"{config.host}:{config.port}",
-            options=(("grpc.keepalive_time_ms", config.keep_alive_time),),
+            options=options,
         )
-        stub = evaluation_pb2_grpc.ServiceStub(channel)
+        self.stub = evaluation_pb2_grpc.ServiceStub(self.channel)

-        return stub, channel
+        self.thread: typing.Optional[threading.Thread] = None
+        self.timer: typing.Optional[threading.Timer] = None
+        self.active = False
+
+    def _create_cache(self):
+        return (
+            LRUCache(maxsize=self.config.max_cache_size)
+            if self.config.cache == CacheType.LRU
+            else None
+        )

     def initialize(self, evaluation_context: EvaluationContext) -> None:
         self.connect()

     def shutdown(self) -> None:
         self.active = False
         self.channel.close()
-        if self.cache:
-            self.cache.clear()

     def connect(self) -> None:
         self.active = True
-        self.thread = threading.Thread(
-            target=self.listen, daemon=True, name="FlagdGrpcServiceWorkerThread"
-        )
-        self.thread.start()

+        # Run monitoring in a separate thread
+        self.monitor_thread = threading.Thread(
+            target=self.monitor, daemon=True, name="FlagdGrpcServiceMonitorThread"
+        )
+        self.monitor_thread.start()
         ## block until ready or deadline reached
         timeout = self.deadline + time.time()
         while not self.connected and time.time() < timeout:
@@ -105,81 +113,87 @@ def connect(self) -> None:
                 "Blocking init finished before data synced. Consider increasing startup deadline to avoid inconsistent evaluations."
             )

+    def monitor(self) -> None:
+        def state_change_callback(new_state: ChannelConnectivity) -> None:
+            logger.debug(f"gRPC state change: {new_state}")
+            if new_state == ChannelConnectivity.READY:
+                if not self.thread or not self.thread.is_alive():
+                    self.thread = threading.Thread(
+                        target=self.listen,
+                        daemon=True,
+                        name="FlagdGrpcServiceWorkerThread",
+                    )
+                    self.thread.start()
+
+                if self.timer and self.timer.is_alive():
+                    logger.debug("gRPC error timer expired")
+                    self.timer.cancel()
+
+            elif new_state == ChannelConnectivity.TRANSIENT_FAILURE:
+                # this is the failed reonnect attempt so we are going into stale
+                self.emit_provider_stale(
+                    ProviderEventDetails(
+                        message="gRPC sync disconnected, reconnecting",
+                    )
+                )
+                # adding a timer, so we can emit the error event after time
+                self.timer = threading.Timer(self.retry_grace_period, self.emit_error)
+
+                logger.debug("gRPC error timer started")
+                self.timer.start()
+                self.connected = False
+
+        self.channel.subscribe(state_change_callback, try_to_connect=True)
+
+    def emit_error(self) -> None:
+        logger.debug("gRPC error emitted")
+        if self.cache is not None:
+            self.cache.clear()
+        self.emit_provider_error(
+            ProviderEventDetails(
+                message="gRPC sync disconnected, reconnecting",
+                error_code=ErrorCode.GENERAL,
+            )
+        )
+
     def listen(self) -> None:
-        retry_delay = self.retry_backoff_seconds
+        logger.info("gRPC starting listener thread")
         call_args = (
             {"timeout": self.streamline_deadline_seconds}
             if self.streamline_deadline_seconds > 0
             else {}
         )
-        retry_counter = 0
-        while self.active:
-            request = evaluation_pb2.EventStreamRequest()
+        request = evaluation_pb2.EventStreamRequest()

+        # defining a never ending loop to recreate the stream
+        while self.active:
             try:
-                logger.debug("Setting up gRPC sync flags connection")
-                for message in self.stub.EventStream(request, **call_args):
+                logger.info("Setting up gRPC sync flags connection")
+                for message in self.stub.EventStream(
+                    request, wait_for_ready=True, **call_args
+                ):
                     if message.type == "provider_ready":
-                        if not self.connected:
-                            self.emit_provider_ready(
-                                ProviderEventDetails(
-                                    message="gRPC sync connection established"
-                                )
+                        self.connected = True
+                        self.emit_provider_ready(
+                            ProviderEventDetails(
+                                message="gRPC sync connection established"
                             )
-                            self.connected = True
-                            retry_counter = 0
-                            # reset retry delay after successsful read
-                            retry_delay = self.retry_backoff_seconds
-
+                        )
                     elif message.type == "configuration_change":
                         data = MessageToDict(message)["data"]
                         self.handle_changed_flags(data)

                     if not self.active:
                         logger.info("Terminating gRPC sync thread")
                         return
-            except grpc.RpcError as e:
-                logger.error(f"SyncFlags stream error, {e.code()=} {e.details()=}")
-                # re-create the stub if there's a connection issue - otherwise reconnect does not work as expected
-                self.stub, self.channel = self._create_stub()
+            except grpc.RpcError as e:  # noqa: PERF203
+                # although it seems like this error log is not interesting, without it, the retry is not working as expected
+                logger.debug(f"SyncFlags stream error, {e.code()=} {e.details()=}")
             except ParseError:
                 logger.exception(
                     f"Could not parse flag data using flagd syntax: {message=}"
                 )

-            self.connected = False
-            self.on_connection_error(retry_counter, retry_delay)
-
-            retry_delay = self.handle_retry(retry_counter, retry_delay)
-
-            retry_counter = retry_counter + 1
-
-    def handle_retry(self, retry_counter: int, retry_delay: float) -> float:
-        if retry_counter == 0:
-            logger.info("gRPC sync disconnected, reconnecting immediately")
-        else:
-            logger.info(f"gRPC sync disconnected, reconnecting in {retry_delay}s")
-            time.sleep(retry_delay)
-        retry_delay = min(1.1 * retry_delay, self.retry_backoff_max_seconds)
-        return retry_delay
-
-    def on_connection_error(self, retry_counter: int, retry_delay: float) -> None:
-        if retry_counter == self.retry_grace_attempts:
-            if self.cache:
-                self.cache.clear()
-            self.emit_provider_error(
-                ProviderEventDetails(
-                    message=f"gRPC sync disconnected, reconnecting in {retry_delay}s",
-                    error_code=ErrorCode.GENERAL,
-                )
-            )
-        elif retry_counter == 1:
-            self.emit_provider_stale(
-                ProviderEventDetails(
-                    message=f"gRPC sync disconnected, reconnecting in {retry_delay}s",
-                )
-            )
-
     def handle_changed_flags(self, data: typing.Any) -> None:
         changed_flags = list(data["flags"].keys())
providers/openfeature-provider-flagd/src/openfeature/contrib/provider/flagd/resolvers/process/file_watcher.py

Lines changed: 3 additions & 1 deletion

@@ -31,17 +31,19 @@ def __init__(
         self.last_modified = 0.0
         self.flag_data: typing.Mapping[str, Flag] = {}
         self.load_data()
+        self.active = True
         self.thread = threading.Thread(target=self.refresh_file, daemon=True)
         self.thread.start()

     def shutdown(self) -> None:
+        self.active = False
         pass

     def get_flag(self, key: str) -> typing.Optional[Flag]:
         return self.flag_data.get(key)

     def refresh_file(self) -> None:
-        while True:
+        while self.active:
             time.sleep(self.poll_interval_seconds)
             logger.debug("checking for new flag store contents from file")
             last_modified = os.path.getmtime(self.file_path)
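
The watcher change applies the same stop-flag idea: the polling thread re-checks self.active on every iteration, so shutdown() actually ends the loop instead of leaving a daemon thread spinning on while True. A generic sketch of the pattern (class and names are illustrative, not this module's API):

import threading
import time

class Poller:
    """Minimal stoppable polling loop, analogous to the file watcher above."""

    def __init__(self, interval_seconds: float) -> None:
        self.active = True
        self.interval = interval_seconds
        self.thread = threading.Thread(target=self._run, daemon=True)
        self.thread.start()

    def _run(self) -> None:
        while self.active:  # exits once shutdown() flips the flag
            time.sleep(self.interval)
            # ... check the watched resource here ...

    def shutdown(self) -> None:
        self.active = False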
