Skip to content

Commit 2f9d516

Browse files
Joe Cummings authored and facebook-github-bot committed
Fix supervision error message and printout for supervision_event (meta-pytorch#803)
Summary: Pull Request resolved: meta-pytorch#803

Updates the error message raised on exit from `supervision_event()`.

Why? The message is clearer, and we can now capture a readable stack trace because we no longer format the whole `SupervisionEvent` with its derived `Debug` implementation. Instead, we print the `actor_id` and use the `Display` implementation of `actor_status`, which is where the error originates.

Reviewed By: vidhyav, zdevito

Differential Revision: D79810924

fbshipit-source-id: 5e6b4608c565ad511a4ee60fa2e61111a2630f55
1 parent bce09c5 commit 2f9d516

File tree

2 files changed

+25
-13
lines changed

2 files changed

+25
-13
lines changed

monarch_hyperactor/src/actor_mesh.rs

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -136,8 +136,8 @@ impl PythonActorMesh {
136136
Unhealthy::SoFarSoGood => (),
137137
Unhealthy::Crashed(event) => {
138138
return Err(SupervisionError::new_err(format!(
139-
"actor mesh is unhealthy with reason: {:?}",
140-
event
139+
"Actor {:?} is unhealthy with reason: {}",
140+
event.actor_id, event.actor_status
141141
)));
142142
}
143143
Unhealthy::StreamClosed => {
@@ -199,8 +199,8 @@ impl PythonActorMesh {
199199
},
200200
};
201201
Ok(PyErr::new::<SupervisionError, _>(format!(
202-
"supervision error: {:?}",
203-
event
202+
"Actor {:?} exited because of the following reason: {}",
203+
event.actor_id, event.actor_status
204204
)))
205205
})
206206
}

python/tests/test_actor_error.py

Lines changed: 21 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -521,11 +521,14 @@ async def test_actor_mesh_supervision_handling(mesh):
521521
await e.check.call()
522522

523523
# existing call should fail with supervision error
524-
with pytest.raises(SupervisionError, match="supervision error:"):
524+
with pytest.raises(
525+
SupervisionError,
526+
match=".*Actor .* exited because of the following reason",
527+
):
525528
await e.fail_with_supervision_error.call_one()
526529

527530
# new call should fail with check of health state of actor mesh
528-
with pytest.raises(SupervisionError, match="actor mesh is unhealthy with reason"):
531+
with pytest.raises(SupervisionError, match="Actor .* is unhealthy with reason"):
529532
await e.check.call()
530533

531534
# should not be able to spawn actors anymore as proc mesh is unhealthy
@@ -588,11 +591,14 @@ async def test_actor_mesh_supervision_handling_chained_error(mesh):
588591
# in a chain of client -> Intermediate -> ErrorActor, a supervision error
589592
# happening in ErrorActor will be captured by Intermediate and re-raised
590593
# as an application error (ActorError).
591-
with pytest.raises(ActorError, match="supervision error:"):
594+
with pytest.raises(
595+
ActorError,
596+
match=".*Actor .* exited because of the following reason",
597+
):
592598
await intermediate_actor.forward_error.call()
593599

594600
# calling success endpoint should fail with ActorError, but with supervision msg.
595-
with pytest.raises(ActorError, match="actor mesh is unhealthy with reason"):
601+
with pytest.raises(ActorError, match="Actor .* is unhealthy with reason"):
596602
await intermediate_actor.forward_success.call()
597603

598604
# healthy actor should still be working
@@ -621,11 +627,14 @@ async def test_base_exception_handling(mesh, method_name):
621627
method = getattr(error_actor, method_name)
622628

623629
# The call should raise a SupervisionError
624-
with pytest.raises(SupervisionError, match="supervision error:"):
630+
with pytest.raises(
631+
SupervisionError,
632+
match=".*Actor .* exited because of the following reason",
633+
):
625634
await method.call_one()
626635

627636
# Subsequent calls should fail with a health state error
628-
with pytest.raises(RuntimeError, match="actor mesh is unhealthy with reason"):
637+
with pytest.raises(RuntimeError, match="Actor .* is unhealthy with reason"):
629638
await error_actor.check.call()
630639

631640

@@ -665,11 +674,14 @@ async def test_supervision_with_sending_error():
665674
await actor_mesh.check_with_payload.call(payload="a")
666675

667676
# send a large payload to trigger send timeout error
668-
with pytest.raises(SupervisionError, match="supervision error:.*"):
677+
with pytest.raises(
678+
SupervisionError,
679+
match=".*Actor .* exited because of the following reason",
680+
):
669681
await actor_mesh.check_with_payload.call(payload="a" * 55000000)
670682

671683
# new call should fail with check of health state of actor mesh
672-
with pytest.raises(SupervisionError, match="actor mesh is unhealthy with reason:"):
684+
with pytest.raises(SupervisionError, match="Actor .* is unhealthy with reason"):
673685
await actor_mesh.check.call()
674-
with pytest.raises(SupervisionError, match="actor mesh is unhealthy with reason:"):
686+
with pytest.raises(SupervisionError, match="Actor .* is unhealthy with reason"):
675687
await actor_mesh.check_with_payload.call(payload="a")

0 commit comments

Comments
 (0)