Skip to content

Commit cf2c10b

Browse files
authored
Make retries and backoff configurable (#1323)
* Make retries and backoff configurable * Write changelog
1 parent 8b410d4 commit cf2c10b

File tree

4 files changed

+47
-35
lines changed

4 files changed

+47
-35
lines changed

webknossos/Changelog.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ For upgrade instructions, please check the respective _Breaking Changes_ section
1818
- Added `layers_to_ignore` argument to `Dataset.copy_dataset`. [#1321](https://github.com/scalableminds/webknossos-libs/pull/1321)
1919

2020
### Changed
21+
- Made the number of retries and the backoff factor configurable (mainly for tensorstore reads/writes). See the `DEFAULT_NUM_RETRIES` and `DEFAULT_BACKOFF_FACTOR` environment variables. [#1323](https://github.com/scalableminds/webknossos-libs/pull/1323)
2122

2223
### Fixed
2324

webknossos/webknossos/dataset/_array.py

Lines changed: 2 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
import re
2-
import time
32
from abc import ABC, abstractmethod
4-
from collections.abc import Callable, Iterable, Iterator
3+
from collections.abc import Iterable, Iterator
54
from dataclasses import dataclass
65
from functools import lru_cache
76
from logging import getLogger
@@ -11,7 +10,6 @@
1110
from typing import (
1211
Any,
1312
Literal,
14-
TypeVar,
1513
)
1614
from urllib.parse import urlparse
1715

@@ -22,15 +20,13 @@
2220
from upath import UPath
2321

2422
from ..geometry import BoundingBox, NDBoundingBox, Vec3Int, VecInt
25-
from ..utils import is_fs_path
23+
from ..utils import call_with_retries, is_fs_path
2624
from .data_format import DataFormat
2725

2826
logger = getLogger(__name__)
2927

3028
TS_CONTEXT = tensorstore.Context()
3129

32-
DEFAULT_NUM_RETRIES = 5
33-
3430

3531
def _is_power_of_two(num: int) -> bool:
3632
return num & (num - 1) == 0
@@ -40,35 +36,6 @@ class ArrayException(Exception):
4036
pass
4137

4238

43-
ReturnType = TypeVar("ReturnType")
44-
45-
46-
def call_with_retries(
47-
fn: Callable[[], ReturnType],
48-
num_retries: int = DEFAULT_NUM_RETRIES,
49-
description: str = "",
50-
) -> ReturnType:
51-
"""Call a function, retrying up to `num_retries` times on an exception during the call. Useful for retrying requests or network io."""
52-
last_exception = None
53-
for i in range(num_retries):
54-
try:
55-
return fn()
56-
except Exception as e: # noqa: PERF203 # allow try except in loop
57-
logger.warning(
58-
f"{description} attempt {i + 1}/{num_retries} failed, retrying..."
59-
f"Error was: {e}"
60-
)
61-
# We introduce some randomness to avoid multiple processes retrying at the same time
62-
random_factor = np.random.uniform(0.5, 1.5)
63-
time.sleep(1 * (1.5**i) * random_factor)
64-
last_exception = e
65-
# If the last attempt fails, we log the error and raise it.
66-
# This is important to avoid silent failures.
67-
logger.error(f"{description} failed after {num_retries} attempts.")
68-
assert last_exception is not None, "last_exception should never be None here"
69-
raise last_exception
70-
71-
7239
@dataclass
7340
class ArrayInfo:
7441
data_format: DataFormat

webknossos/webknossos/dataset/defaults.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,3 +59,14 @@ def _rebuild_sslcontext(protocol: ssl._SSLMethod, cafile: str | None) -> ssl.SSL
5959
ZATTRS_FILE_NAME = ".zattrs"
6060
ZARR_JSON_FILE_NAME = "zarr.json"
6161
SSL_CONTEXT = _create_sslcontext()
62+
63+
# Retry configuration, read once at import time. Each value can be overridden
# through the environment variable of the same name; when the variable is not
# set, the built-in fallback (20 retries, backoff factor 1.75) is used.
# A value that cannot be parsed raises ValueError during import.
DEFAULT_NUM_RETRIES = int(os.environ.get("DEFAULT_NUM_RETRIES", "20"))
DEFAULT_BACKOFF_FACTOR = float(os.environ.get("DEFAULT_BACKOFF_FACTOR", "1.75"))

webknossos/webknossos/utils.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,8 +31,41 @@
3131
from rich.progress import Progress
3232
from upath import UPath
3333

34+
from .dataset.defaults import DEFAULT_BACKOFF_FACTOR, DEFAULT_NUM_RETRIES
35+
36+
logger = logging.getLogger(__name__)
37+
3438
times = {}
3539

40+
ReturnType = TypeVar("ReturnType")
41+
42+
43+
def call_with_retries(
    fn: Callable[[], ReturnType],
    num_retries: int = DEFAULT_NUM_RETRIES,
    description: str = "",
    backoff_factor: float = DEFAULT_BACKOFF_FACTOR,
) -> ReturnType:
    """Call `fn`, retrying with exponential backoff when it raises.

    Useful for retrying requests or network io.

    Args:
        fn: Zero-argument callable to invoke.
        num_retries: Maximum number of attempts in total (including the
            first one). Must be at least 1.
        description: Prefix used in the log messages to identify the call.
        backoff_factor: Base of the exponential delay. After the failed
            attempt with zero-based index `i`, the function sleeps for
            `backoff_factor**i` seconds multiplied by a random factor
            drawn from `[0.66, 1.5)`.

    Returns:
        Whatever `fn` returns on its first successful attempt.

    Raises:
        ValueError: If `num_retries` is smaller than 1.
        Exception: The exception raised by `fn` on the last attempt, if
            every attempt failed.
    """
    if num_retries < 1:
        raise ValueError(f"num_retries must be at least 1, got {num_retries}.")

    final_attempt = num_retries - 1
    for attempt in range(num_retries):
        try:
            return fn()
        except Exception as e:  # noqa: PERF203 # allow try except in loop
            if attempt == final_attempt:
                # No attempt follows, so do not sleep (the last backoff delay
                # can be hours long with the default settings). Log the
                # failure and re-raise to avoid silent failures.
                logger.error(f"{description} failed after {num_retries} attempts.")
                raise
            logger.warning(
                f"{description} attempt {attempt + 1}/{num_retries} failed, retrying... "
                f"Error was: {e}"
            )
            # We introduce some randomness to avoid multiple processes retrying at the same time
            random_factor = np.random.uniform(0.66, 1.5)
            time.sleep((backoff_factor**attempt) * random_factor)

    # The loop always returns or raises; this only satisfies type checkers.
    raise AssertionError("call_with_retries exited its retry loop unexpectedly")
68+
3669

3770
def time_start(identifier: str) -> None:
3871
times[identifier] = time.time()

0 commit comments

Comments
 (0)