Skip to content

Commit cb79ef6

Browse files
authored
Only Retry Certain Errors (#1330)
* only retry certain errors (not all) * write changelog * Do not retry after last attempt, improve test comments
1 parent bf0e00a commit cb79ef6

File tree

3 files changed

+93
-10
lines changed

3 files changed

+93
-10
lines changed

webknossos/Changelog.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ For upgrade instructions, please check the respective _Breaking Changes_ section
1717
### Added
1818

1919
### Changed
20+
- Limit custom retries (e.g. additional to tensorstore internal retries) to certain network errors ("Too Many Requests", "GatewayTimeout"). [#1330](https://github.com/scalableminds/webknossos-libs/pull/1330)
2021

2122
### Fixed
2223

webknossos/tests/test_utils.py

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
from unittest.mock import Mock, patch
2+
3+
import pytest
4+
5+
from webknossos.utils import call_with_retries
6+
7+
8+
def test_call_with_retries_success() -> None:
9+
"""Test that a successful function call returns immediately."""
10+
mock_fn = Mock(return_value="success")
11+
12+
result = call_with_retries(mock_fn)
13+
14+
assert result == "success"
15+
mock_fn.call_count == 1 # Function called only once (direct success)
16+
17+
18+
@patch("time.sleep")
19+
def test_call_with_retries_sucess_with_retry(mock_sleep: Mock) -> None:
20+
"""Test retry behavior when function succeeds after retryable failures."""
21+
mock_fn = Mock(
22+
side_effect=[
23+
Exception("Too Many Requests"),
24+
Exception("GatewayTimeout"),
25+
"success",
26+
]
27+
)
28+
29+
result = call_with_retries(mock_fn, num_retries=3, backoff_factor=2.0)
30+
31+
assert result == "success"
32+
assert mock_fn.call_count == 3 # Called once for each try
33+
assert mock_sleep.call_count == 2 # Sleep called twice for retries
34+
35+
36+
@patch("time.sleep")
37+
def test_call_with_retries_failure_after_retry(mock_sleep: Mock) -> None:
38+
"""Test retry behavior when function succeeds after retryable failures."""
39+
mock_fn = Mock(
40+
side_effect=[
41+
Exception("Too Many Requests"),
42+
Exception("GatewayTimeout"),
43+
"success",
44+
]
45+
)
46+
47+
with pytest.raises(Exception):
48+
call_with_retries(mock_fn, num_retries=2, backoff_factor=2.0)
49+
50+
assert mock_fn.call_count == 2
51+
assert (
52+
mock_sleep.call_count == 1
53+
) # Sleep called once after the first failure but not after the second/last failure
54+
55+
56+
@patch("time.sleep")
57+
def test_call_with_retries_direct_failure(mock_sleep: Mock) -> None:
58+
"""Test retry behavior when function succeeds after retryable failures."""
59+
mock_fn = Mock(
60+
side_effect=[
61+
RuntimeError("Non Retryable Runtime Error"),
62+
"success",
63+
]
64+
)
65+
66+
with pytest.raises(RuntimeError):
67+
call_with_retries(mock_fn, num_retries=5, backoff_factor=2.0)
68+
69+
assert mock_fn.call_count == 1 # Only called once, no retries
70+
assert mock_sleep.call_count == 0 # Sleep not called since it failed immediately

webknossos/webknossos/utils.py

Lines changed: 22 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -41,29 +41,41 @@
4141
ReturnType = TypeVar("ReturnType")
4242

4343

44+
def _is_exception_retryable(exception: Exception) -> bool:
45+
exception_str_lower = str(exception).lower()
46+
if "too many requests" in exception_str_lower or "gateway" in exception_str_lower:
47+
return True
48+
return False
49+
50+
4451
def call_with_retries(
4552
fn: Callable[[], ReturnType],
4653
num_retries: int = DEFAULT_NUM_RETRIES,
4754
description: str = "",
4855
backoff_factor: float = DEFAULT_BACKOFF_FACTOR,
4956
) -> ReturnType:
50-
"""Call a function, retrying up to `num_retries` times on an exception during the call. Useful for retrying requests or network io."""
57+
"""Call a function, retrying up to `num_retries` times on common retryable (network) exceptions. Useful for retrying requests or network io."""
5158
last_exception = None
52-
for i in range(num_retries):
59+
for current_retry_number in range(num_retries):
5360
try:
5461
return fn()
5562
except Exception as e: # noqa: PERF203 # allow try except in loop
56-
logger.warning(
57-
f"{description} attempt {i + 1}/{num_retries} failed, retrying..."
58-
f"Error was: {e}"
59-
)
60-
# We introduce some randomness to avoid multiple processes retrying at the same time
61-
random_factor = np.random.uniform(0.66, 1.5)
62-
time.sleep((backoff_factor**i) * random_factor)
6363
last_exception = e
64+
# We only sleep and retry if it was not the last attempt and the exception is retryable and.
65+
if current_retry_number < num_retries - 1 and _is_exception_retryable(e):
66+
logger.warning(
67+
f"{description} attempt {current_retry_number + 1}/{num_retries} failed, retrying..."
68+
f"Error was: {e}"
69+
)
70+
# We introduce some randomness to avoid multiple processes retrying at the same time
71+
random_factor = np.random.uniform(0.66, 1.5)
72+
time.sleep((backoff_factor**current_retry_number) * random_factor)
73+
else:
74+
break
6475
# If the last attempt fails, we log the error and raise it.
6576
# This is important to avoid silent failures.
66-
logger.error(f"{description} failed after {num_retries} attempts.")
77+
if current_retry_number > 0:
78+
logger.error(f"{description} failed after {current_retry_number + 1} attempts.")
6779
assert last_exception is not None, "last_exception should never be None here"
6880
raise last_exception
6981

0 commit comments

Comments
 (0)