Skip to content

Commit 9490d2b

Browse files
committed
works
1 parent 6dd28f4 commit 9490d2b

File tree

1 file changed

+95
-0
lines changed

1 file changed

+95
-0
lines changed

05_scheduling/custom_retries.py

Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
# ---
2+
# cmd: ["modal", "run", "05_scheduling.custom_retries"]
3+
# ---
4+
5+
# # Custom retries by exception type
6+
# There are two types of retries in Modal:
7+
# 1. When a function execution is interrupted by [preemption](https://modal.com/docs/guide/preemption#preemption), the input will be retried. This behavior is not configurable at this time.
8+
# 2. When a function execution fails, i.e. by raising an exception, Modal will retry the function call if you have [`modal.Retries`](https://modal.com/docs/reference/modal.Retries) configured.
9+
# This example is about customizing the latter to only retry on certain exception types.
10+
# For example, you may only want to retry on certain expected errors (e.g. timeouts, or
11+
# transient network errors) and crash immediately on others (e.g. OOM, bad input).
12+
13+
# The trick is to:
14+
# 1. Raise retryable errors in the usual way to trigger `modal.Retries`
15+
# 2. Catch and `return` non-retryable errors.
16+
# For #2, Modal will see a successful function call execution and return the exception
17+
# to your client/server to handle as desired.
18+
19+
import modal
20+
21+
# The Modal App that the function and local entrypoint in this file attach to.
app = modal.App("example-custom-retries")
22+
23+
# ## Define retryable vs. crashable exceptions
24+
25+
# Exception types that should be re-raised so that `modal.Retries` kicks in:
# transient CUDA errors, network blips, etc.
retry_exceptions = (
    TimeoutError,
    ConnectionError,
)
30+
31+
# Exception types for which a retry is pointless:
# OOM, bad input — retrying won't help.
crashable_exceptions = (
    MemoryError,
    ValueError,
)
36+
37+
# ## Use a Dict to track call count across retries
38+
#
39+
# Each retry runs in a new container invocation, so we use a
40+
# [`modal.Dict`](https://modal.com/docs/reference/modal.Dict) to share
41+
# state and make the demo deterministic.
42+
43+
# Named, persistent Dict holding the attempt counter under the "calls" key.
# `create_if_missing=True` creates it on first use.
call_counter = modal.Dict.from_name(
    "custom-retries-demo-counter", create_if_missing=True
)
46+
47+
# ## Demo App
48+
#
49+
# This function follows a scripted sequence to demonstrate the behavior:
50+
#
51+
# 1. **Call 1** — raises `TimeoutError` (retryable → Modal retries)
52+
# 2. **Call 2** — raises `ConnectionError` (retryable → Modal retries)
53+
# 3. **Call 3** — raises `MemoryError` (crashable → returned, no more retries)
54+
#
55+
# So you'll see two retries, then a clean stop on the third attempt.
56+
57+
58+
@app.function(retries=modal.Retries(max_retries=5, initial_delay=1.0))
def flaky_task():
    """Fail in a scripted sequence to demonstrate retrying by exception type.

    Each attempt increments the "calls" entry in ``call_counter`` and picks
    the scripted error for that attempt number. Errors matching
    ``retry_exceptions`` are raised so that Modal retries the call; any other
    error is returned as a value, so the call counts as successful and no
    further retries happen.

    Returns:
        The non-retryable exception instance selected for this attempt.

    Raises:
        TimeoutError: On the first scripted attempt.
        ConnectionError: On the second scripted attempt.
    """
    call_count = call_counter.get("calls", 0) + 1
    call_counter["calls"] = call_count
    print(f"Attempt {call_count}")

    # Scripted error sequence
    errors = [
        TimeoutError("GPU timed out"),  # attempt 1: retryable
        ConnectionError("lost connection to data server"),  # attempt 2: retryable
        MemoryError("CUDA out of memory"),  # attempt 3: crashable
    ]
    # Clamp the index so attempts beyond the script reuse the final error.
    error = errors[min(call_count, len(errors)) - 1]

    print(f" Hit: {error!r}")

    if isinstance(error, retry_exceptions):
        print(" -> retryable, re-raising so Modal retries")
        raise error

    # Return instead of raise — Modal sees success, stops retrying
    print(" -> non-retryable, returning error to stop retries")
    return error
81+
82+
83+
# ## Entrypoint
84+
#
85+
# The caller checks whether the return value is an exception.
86+
87+
88+
@app.local_entrypoint()
def main():
    """Reset the attempt counter, run ``flaky_task`` remotely, report the outcome.

    A returned exception instance means the task stopped on a non-retryable
    error; anything else is printed as a normal result.
    """
    call_counter["calls"] = 0  # reset counter
    result = flaky_task.remote()
    if isinstance(result, Exception):
        print(f"Stopped with non-retryable error: {result!r}")
    else:
        print(f"Result: {result}")

0 commit comments

Comments
 (0)