Skip to content

Commit fde1949

Browse files
dulinrileyfacebook-github-bot
authored andcommitted
Change ActorError print to TracebackException to capture context (#765)
Summary: Pull Request resolved: #765 The current ActorError printout didn't include nested exceptions, which can be critical for debugging if there are large wrapper try/except blocks. Use the `TracebackException.format` method to print the exception instead to include the nested exceptions. Reviewed By: zdevito Differential Revision: D79664075 fbshipit-source-id: 7c4ca55131567e753a527aae74930ba7873734d6
1 parent 0b5d959 commit fde1949

File tree

2 files changed

+70
-10
lines changed

2 files changed

+70
-10
lines changed

python/monarch/_src/actor/actor_mesh.py

Lines changed: 17 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -16,10 +16,9 @@
1616
import traceback
1717

1818
from dataclasses import dataclass
19-
from traceback import extract_tb, StackSummary
19+
from traceback import TracebackException
2020
from typing import (
2121
Any,
22-
AsyncGenerator,
2322
Awaitable,
2423
Callable,
2524
cast,
@@ -32,7 +31,6 @@
3231
Iterator,
3332
List,
3433
Literal,
35-
NamedTuple,
3634
Optional,
3735
overload,
3836
ParamSpec,
@@ -902,16 +900,25 @@ def __init__(
902900
message: str = "A remote actor call has failed.",
903901
) -> None:
904902
self.exception = exception
905-
self.actor_mesh_ref_frames: StackSummary = extract_tb(exception.__traceback__)
903+
# Need to stringify the exception early, because the PyPI package
904+
# exceptiongroup may monkeypatch the "TracebackException" class for python
905+
# versions < 3.11. If it gets unpickled in a different scope without
906+
# using that monkeypatch, it'll have an exception in "format()".
907+
# Store the traceback string instead which shouldn't change between machines.
908+
actor_mesh_ref_tb = TracebackException.from_exception(exception).format()
909+
# Replace any traceback lines to indicate it's a remote call traceback.
910+
actor_mesh_ref_tb = (
911+
s.replace(
912+
"Traceback (most recent call last):",
913+
"Traceback of where the remote call failed (most recent call last):",
914+
)
915+
for s in actor_mesh_ref_tb
916+
)
917+
self.exception_formatted = "".join(actor_mesh_ref_tb)
906918
self.message = message
907919

908920
def __str__(self) -> str:
909-
exe = str(self.exception)
910-
actor_mesh_ref_tb = "".join(traceback.format_list(self.actor_mesh_ref_frames))
911-
return (
912-
f"{self.message}\n"
913-
f"Traceback of where the remote call failed (most recent call last):\n{actor_mesh_ref_tb}{type(self.exception).__name__}: {exe}"
914-
)
921+
return f"{self.message}\n {self.exception_formatted}"
915922

916923

917924
def current_actor_name() -> str:

python/tests/test_actor_error.py

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,24 @@ def raise_exception(self) -> None:
3434
raise Exception("This is a test exception")
3535

3636

37+
class NestedExceptionActor(Actor):
38+
@endpoint
39+
async def raise_exception_with_context(self) -> None:
40+
try:
41+
raise Exception("Inner exception")
42+
except Exception:
43+
# Don't use from here to set __context__ instead of __cause__
44+
raise Exception("Outer exception")
45+
46+
@endpoint
47+
async def raise_exception_with_cause(self) -> None:
48+
try:
49+
raise Exception("Inner exception")
50+
except Exception as e:
51+
# Use from here to set __cause__ instead of __context__
52+
raise Exception("Outer exception") from e
53+
54+
3755
class BrokenPickleClass:
3856
"""A class that can be configured to raise exceptions during pickling/unpickling."""
3957

@@ -116,6 +134,41 @@ def test_actor_exception_sync(mesh, actor_class, num_procs):
116134
exception_actor.raise_exception.call().get()
117135

118136

137+
@pytest.mark.parametrize(
138+
"mesh",
139+
[local_proc_mesh, proc_mesh],
140+
ids=["local_proc_mesh", "distributed_proc_mesh"],
141+
)
142+
async def test_actor_error_message(mesh):
143+
"""
144+
Test that exceptions raised in actor endpoints capture nested exceptions.
145+
"""
146+
proc = mesh(gpus=2)
147+
exception_actor = await proc.spawn("exception_actor", NestedExceptionActor)
148+
149+
with pytest.raises(ActorError) as exc_info:
150+
await exception_actor.raise_exception_with_cause.call()
151+
152+
# Make sure both exception messages are present in the message.
153+
assert "Inner exception" in str(exc_info.value)
154+
assert "Outer exception" in str(exc_info.value)
155+
# Make sure the "cause" is set.
156+
assert "The above exception was the direct cause of the following exception" in str(
157+
exc_info.value
158+
)
159+
160+
with pytest.raises(ActorError) as exc_info:
161+
await exception_actor.raise_exception_with_context.call()
162+
163+
# Make sure both exception messages are present in the message.
164+
assert "Inner exception" in str(exc_info.value)
165+
assert "Outer exception" in str(exc_info.value)
166+
# Make sure the "cause" is set.
167+
assert "During handling of the above exception, another exception occurred" in str(
168+
exc_info.value
169+
)
170+
171+
119172
'''
120173
# oss_skip: importlib not pulling resource correctly in git CI, needs to be revisited
121174
@pytest.mark.oss_skip

0 commit comments

Comments
 (0)