
Commit 1378621

Merge pull request #69 from meshcore-dev/2.1.7
2.1.7 - Additional improvements to request tracking and retries
2 parents 1ea4e7b + b5957c0 commit 1378621

6 files changed (+189 lines, -37 lines)

custom_components/meshcore/config_flow.py

Lines changed: 6 additions & 5 deletions
@@ -34,6 +34,7 @@
    CONF_REPEATER_UPDATE_INTERVAL,
    CONF_REPEATER_TELEMETRY_ENABLED,
    DEFAULT_REPEATER_UPDATE_INTERVAL,
+   MIN_UPDATE_INTERVAL,
    CONF_TRACKED_CLIENTS,
    CONF_CLIENT_NAME,
    CONF_CLIENT_UPDATE_INTERVAL,
@@ -449,7 +450,7 @@ def _show_add_repeater_form(self, repeater_dict, errors=None, user_input=None):
        vol.Required(CONF_REPEATER_NAME): vol.In(repeater_dict.keys()),
        vol.Optional(CONF_REPEATER_PASSWORD, default=default_password): str,
        vol.Optional(CONF_REPEATER_TELEMETRY_ENABLED, default=default_telemetry): bool,
-       vol.Optional(CONF_REPEATER_UPDATE_INTERVAL, default=default_interval): vol.All(cv.positive_int, vol.Range(min=300, max=3600)),
+       vol.Optional(CONF_REPEATER_UPDATE_INTERVAL, default=default_interval): vol.All(cv.positive_int, vol.Range(min=MIN_UPDATE_INTERVAL)),
    }),
    errors=errors,
)
@@ -600,7 +601,7 @@ async def async_step_add_client(self, user_input=None):
    step_id="add_client",
    data_schema=vol.Schema({
        vol.Required(CONF_CLIENT_NAME): vol.In(client_dict.keys()),
-       vol.Optional(CONF_CLIENT_UPDATE_INTERVAL, default=DEFAULT_CLIENT_UPDATE_INTERVAL): vol.All(cv.positive_int, vol.Range(min=600, max=7200)),
+       vol.Optional(CONF_CLIENT_UPDATE_INTERVAL, default=DEFAULT_CLIENT_UPDATE_INTERVAL): vol.All(cv.positive_int, vol.Range(min=MIN_UPDATE_INTERVAL)),
    }),
    errors=errors,
)
@@ -622,7 +623,7 @@ async def async_step_add_client(self, user_input=None):
    step_id="add_client",
    data_schema=vol.Schema({
        vol.Required(CONF_CLIENT_NAME): vol.In(client_dict.keys()),
-       vol.Optional(CONF_CLIENT_UPDATE_INTERVAL, default=DEFAULT_CLIENT_UPDATE_INTERVAL): vol.All(cv.positive_int, vol.Range(min=600, max=7200)),
+       vol.Optional(CONF_CLIENT_UPDATE_INTERVAL, default=DEFAULT_CLIENT_UPDATE_INTERVAL): vol.All(cv.positive_int, vol.Range(min=MIN_UPDATE_INTERVAL)),
    }),
    errors=errors,
)
@@ -816,7 +817,7 @@ async def async_step_edit_repeater(self, user_input=None):
    data_schema=vol.Schema({
        vol.Optional(CONF_REPEATER_PASSWORD, default=repeater.get("password", "")): str,
        vol.Optional(CONF_REPEATER_TELEMETRY_ENABLED, default=repeater.get("telemetry_enabled", False)): bool,
-       vol.Optional(CONF_REPEATER_UPDATE_INTERVAL, default=repeater.get("update_interval", DEFAULT_REPEATER_UPDATE_INTERVAL)): vol.All(cv.positive_int, vol.Range(min=300, max=3600)),
+       vol.Optional(CONF_REPEATER_UPDATE_INTERVAL, default=repeater.get("update_interval", DEFAULT_REPEATER_UPDATE_INTERVAL)): vol.All(cv.positive_int, vol.Range(min=MIN_UPDATE_INTERVAL)),
    }),
    description_placeholders={
        "device_name": repeater.get("name", "Unknown")
@@ -852,7 +853,7 @@ async def async_step_edit_client(self, user_input=None):
return self.async_show_form(
    step_id="edit_client",
    data_schema=vol.Schema({
-       vol.Optional(CONF_CLIENT_UPDATE_INTERVAL, default=client.get("update_interval", DEFAULT_CLIENT_UPDATE_INTERVAL)): vol.All(cv.positive_int, vol.Range(min=600, max=7200)),
+       vol.Optional(CONF_CLIENT_UPDATE_INTERVAL, default=client.get("update_interval", DEFAULT_CLIENT_UPDATE_INTERVAL)): vol.All(cv.positive_int, vol.Range(min=MIN_UPDATE_INTERVAL)),
    }),
    description_placeholders={
        "device_name": client.get("name", "Unknown")

custom_components/meshcore/const.py

Lines changed: 3 additions & 0 deletions
@@ -56,6 +56,7 @@
CONF_REPEATER_UPDATE_INTERVAL: Final = "update_interval"
CONF_REPEATER_TELEMETRY_ENABLED: Final = "telemetry_enabled"
DEFAULT_REPEATER_UPDATE_INTERVAL: Final = 900  # 15 minutes in seconds
+ MIN_UPDATE_INTERVAL: Final = 300  # 5 minutes minimum
MAX_REPEATER_FAILURES_BEFORE_LOGIN: Final = 5  # After this many failures, try login

# Client tracking constants
@@ -76,6 +77,8 @@
# Backoff constants for repeater failures
REPEATER_BACKOFF_BASE: Final = 2  # Base multiplier for exponential backoff
REPEATER_BACKOFF_MAX_MULTIPLIER: Final = 120  # Maximum backoff multiplier (10 minutes when * 5 seconds)
+ MAX_FAILURES_BEFORE_PATH_RESET: Final = 3  # Reset path after this many failures
+ MAX_RETRY_ATTEMPTS: Final = 5  # Maximum retry attempts within refresh window


# Generic battery voltage to percentage lookup table

custom_components/meshcore/coordinator.py

Lines changed: 64 additions & 22 deletions
@@ -30,6 +30,8 @@
    MAX_REPEATER_FAILURES_BEFORE_LOGIN,
    REPEATER_BACKOFF_BASE,
    REPEATER_BACKOFF_MAX_MULTIPLIER,
+   MAX_FAILURES_BEFORE_PATH_RESET,
+   MAX_RETRY_ATTEMPTS,
    CONF_CONTACT_REFRESH_INTERVAL,
    DEFAULT_CONTACT_REFRESH_INTERVAL,
    CONF_REPEATER_TELEMETRY_ENABLED,
@@ -128,6 +130,34 @@ def __init__(

        if not hasattr(self, "last_update_success_time"):
            self.last_update_success_time = self._current_time()
+
+       # Initialize reliability stats tracking
+       self._reliability_stats = {}
+
+   def _increment_success(self, pubkey_prefix: str) -> None:
+       """Increment success counter for a node."""
+       stats_key = f"{pubkey_prefix}_request_successes"
+       self._reliability_stats[stats_key] = self._reliability_stats.get(stats_key, 0) + 1
+
+   def _increment_failure(self, pubkey_prefix: str) -> None:
+       """Increment failure counter for a node."""
+       stats_key = f"{pubkey_prefix}_request_failures"
+       self._reliability_stats[stats_key] = self._reliability_stats.get(stats_key, 0) + 1
+
+   async def _reset_node_path(self, contact, node_name: str) -> bool:
+       """Reset routing path for a node and return success status."""
+       try:
+           result = await self.api.mesh_core.commands.reset_path(contact)
+           if result and result.type != EventType.ERROR:
+               self.logger.info(f"Successfully reset path for {node_name}")
+               return True
+           else:
+               error_msg = result.payload if result and result.type == EventType.ERROR else "no response or unexpected result"
+               self.logger.warning(f"Failed to reset path for {node_name}: {error_msg}")
+               return False
+       except Exception as ex:
+           self.logger.warning(f"Exception resetting path for {node_name}: {ex}")
+           return False

    def update_telemetry_settings(self, config_entry: ConfigEntry) -> None:
        """Update telemetry settings from config entry."""
@@ -190,15 +220,19 @@ async def _update_repeater(self, repeater_config):
                    repeater_config.get(CONF_REPEATER_PASSWORD, "")
                )

-               if login_result.type == EventType.ERROR:
-                   self.logger.error(f"Login to repeater {repeater_name} failed: {login_result.payload}")
-               else:
+               if login_result and login_result.type != None and login_result.type != EventType.ERROR:
                    self.logger.info(f"Successfully logged in to repeater {repeater_name}")
+                   self._increment_success(pubkey_prefix)
                    # Track login time for telemetry refresh
                    self._repeater_login_times[pubkey_prefix] = self._current_time()
+               else:
+                   error_msg = login_result.payload if login_result and login_result.type == EventType.ERROR else "timeout or no response"
+                   self.logger.error(f"Login to repeater {repeater_name} failed: {error_msg}")
+                   self._increment_failure(pubkey_prefix)

            except Exception as ex:
                self.logger.error(f"Exception during login to repeater {repeater_name}: {ex}")
+               self._increment_failure(pubkey_prefix)

            # Reset failures after login attempt regardless of outcome
            # This prevents repeated login attempts if they keep failing
@@ -219,27 +253,26 @@ async def _update_repeater(self, repeater_config):
                # Increment failure count and apply backoff
                new_failure_count = failure_count + 1
                self._repeater_consecutive_failures[pubkey_prefix] = new_failure_count
+               self._increment_failure(pubkey_prefix)

-               # Reset path after 5 failures if there's an established path
-               if new_failure_count == 5 and contact and contact.get("out_path_len", 0) != -1:
-                   try:
-                       await self.api.mesh_core.commands.reset_path(pubkey_prefix)
-                       self.logger.info(f"Reset path for repeater {repeater_name} after 5 failures")
-                   except Exception as ex:
-                       self.logger.warning(f"Failed to reset path for repeater {repeater_name}: {ex}")
+               # Reset path after configured failures if there's an established path
+               if new_failure_count == MAX_FAILURES_BEFORE_PATH_RESET and contact and contact.get("out_path_len", -1) >= 0:
+                   await self._reset_node_path(contact, repeater_name)

                update_interval = repeater_config.get(CONF_REPEATER_UPDATE_INTERVAL, DEFAULT_REPEATER_UPDATE_INTERVAL)
                self._apply_repeater_backoff(pubkey_prefix, new_failure_count, update_interval)
            elif result.payload.get('uptime', 0) == 0:
                self.logger.warn(f"Malformed status response from repeater {repeater_name}: {result.payload}")
                new_failure_count = failure_count + 1
                self._repeater_consecutive_failures[pubkey_prefix] = new_failure_count
+               self._increment_failure(pubkey_prefix)
                update_interval = repeater_config.get(CONF_REPEATER_UPDATE_INTERVAL, DEFAULT_REPEATER_UPDATE_INTERVAL)
                self._apply_repeater_backoff(pubkey_prefix, new_failure_count, update_interval)
            else:
                self.logger.debug(f"Successfully updated repeater {repeater_name}")
                # Reset failure count on success
                self._repeater_consecutive_failures[pubkey_prefix] = 0
+               self._increment_success(pubkey_prefix)

                # Trigger state updates for any entities listening for this repeater
                self.async_set_updated_data(self.data)
@@ -254,6 +287,7 @@ async def _update_repeater(self, repeater_config):
            # Increment failure count and apply backoff
            new_failure_count = self._repeater_consecutive_failures.get(pubkey_prefix, 0) + 1
            self._repeater_consecutive_failures[pubkey_prefix] = new_failure_count
+           self._increment_failure(pubkey_prefix)
            update_interval = repeater_config.get(CONF_REPEATER_UPDATE_INTERVAL, DEFAULT_REPEATER_UPDATE_INTERVAL)
            self._apply_repeater_backoff(pubkey_prefix, new_failure_count, update_interval)
        finally:
@@ -264,13 +298,21 @@ async def _update_repeater(self, repeater_config):
    def _apply_backoff(self, pubkey_prefix: str, failure_count: int, update_interval: int, update_type: str = "repeater") -> None:
        """Apply exponential backoff delay for failed updates.

+       Uses dynamic base interval to ensure max 5 retries within the refresh window.
+       Resets failure count after MAX_RETRY_ATTEMPTS to start fresh.
+
        Args:
            pubkey_prefix: The node's public key prefix
            failure_count: Number of consecutive failures
            update_interval: The configured update interval to cap the backoff at
            update_type: Type of update ("repeater" or "telemetry")
        """
-       backoff_delay = min(REPEATER_BACKOFF_BASE ** failure_count, update_interval)
+       # Calculate base interval to fit 5 retries within refresh window
+       # Sum of geometric series: base * (2^5 - 1) / (2 - 1) = base * 31
+       # We want this to be roughly half the refresh interval for safety
+       base_interval = max(1, update_interval // (31 * 2))
+
+       backoff_delay = min(base_interval * (REPEATER_BACKOFF_BASE ** failure_count), update_interval)
        next_update_time = self._current_time() + backoff_delay

        if update_type == "telemetry":
@@ -280,6 +322,7 @@ def _apply_backoff(self, pubkey_prefix: str, failure_count: int, update_interval

        self.logger.debug(f"Applied backoff for {update_type} {pubkey_prefix}: "
                          f"failure_count={failure_count}, "
+                         f"base_interval={base_interval}s, "
                          f"delay={backoff_delay}s, "
                          f"interval_cap={update_interval}s")
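To see what the reworked formula produces, here is a quick check with the default 900-second repeater interval (DEFAULT_REPEATER_UPDATE_INTERVAL) and REPEATER_BACKOFF_BASE = 2; the numbers follow directly from the code above:

REPEATER_BACKOFF_BASE = 2
update_interval = 900  # DEFAULT_REPEATER_UPDATE_INTERVAL

base_interval = max(1, update_interval // (31 * 2))  # 900 // 62 = 14 seconds
delays = [
    min(base_interval * (REPEATER_BACKOFF_BASE ** failure_count), update_interval)
    for failure_count in range(1, 6)
]
print(delays)       # [28, 56, 112, 224, 448]
print(sum(delays))  # 868 -> five retries fit inside one 900 s refresh window

The old formula, min(REPEATER_BACKOFF_BASE ** failure_count, update_interval), gave 2, 4, 8, 16, 32 seconds for the same failure counts regardless of the configured interval, so retries bunched up immediately after a failure.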

@@ -313,6 +356,7 @@ async def _update_node_telemetry(self, contact, pubkey_prefix: str, node_name: s
            self.logger.debug(f"Telemetry response received from {node_name}: {telemetry_result}")
            # Reset failure count on success
            self._telemetry_consecutive_failures[pubkey_prefix] = 0
+           self._increment_success(pubkey_prefix)
            # Schedule next telemetry update
            next_telemetry_time = self._current_time() + update_interval
            self._next_telemetry_update_times[pubkey_prefix] = next_telemetry_time
@@ -321,14 +365,11 @@ async def _update_node_telemetry(self, contact, pubkey_prefix: str, node_name: s
            # Increment failure count and apply backoff
            new_failure_count = failure_count + 1
            self._telemetry_consecutive_failures[pubkey_prefix] = new_failure_count
+           self._increment_failure(pubkey_prefix)

-           # Reset path after 5 failures if there's an established path
-           if new_failure_count == 5 and contact and contact.get("out_path_len", 0) != -1:
-               try:
-                   await self.api.mesh_core.commands.reset_path(pubkey_prefix)
-                   self.logger.info(f"Reset path for node {node_name} after 5 telemetry failures")
-               except Exception as ex:
-                   self.logger.warning(f"Failed to reset path for node {node_name}: {ex}")
+           # Reset path after configured failures if there's an established path
+           if new_failure_count == MAX_FAILURES_BEFORE_PATH_RESET and contact and contact.get("out_path_len", -1) >= 0:
+               await self._reset_node_path(contact, node_name)

            self._apply_backoff(pubkey_prefix, new_failure_count, update_interval, "telemetry")

@@ -337,12 +378,13 @@ async def _update_node_telemetry(self, contact, pubkey_prefix: str, node_name: s
            # Increment failure count and apply backoff
            new_failure_count = failure_count + 1
            self._telemetry_consecutive_failures[pubkey_prefix] = new_failure_count
+           self._increment_failure(pubkey_prefix)

-           # Reset path after 5 failures if there's an established path
-           if new_failure_count == 5 and contact and contact.get("out_path_len", 0) != -1:
+           # Reset path after configured failures if there's an established path
+           if new_failure_count == MAX_FAILURES_BEFORE_PATH_RESET and contact and contact.get("out_path_len", -1) != -1:
                try:
                    await self.api.mesh_core.commands.reset_path(pubkey_prefix)
-                   self.logger.info(f"Reset path for node {node_name} after 5 telemetry failures")
+                   self.logger.info(f"Reset path for node {node_name} after {MAX_FAILURES_BEFORE_PATH_RESET} telemetry failures")
                except Exception as reset_ex:
                    self.logger.warning(f"Failed to reset path for node {node_name}: {reset_ex}")

@@ -479,7 +521,7 @@ async def _async_update_data(self) -> None:
            pubkey_prefix = repeater_config.get("pubkey_prefix")
            repeater_name = repeater_config.get("name")

-           # Clean up completed or failed tasks
+           # Clean up completed or failed tasks
            if pubkey_prefix in self._active_repeater_tasks:
                task = self._active_repeater_tasks[pubkey_prefix]
                if task.done():
