Skip to content

Commit 3a17922

Browse files
pzhan9 authored and meta-codesync[bot] committed
Log supervision events in Handler<SupervisionFailureMessage> (#1951)
Summary:
Pull Request resolved: #1951
As title.
Reviewed By: shayne-fletcher
Differential Revision: D87493413
fbshipit-source-id: 1cf22deb039367b886f81bbceabf69f3b7cebc68
1 parent 54b7a06 commit 3a17922

File tree

3 files changed

+41
-5
lines changed

3 files changed

+41
-5
lines changed

monarch_hyperactor/src/actor.rs

Lines changed: 38 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -899,9 +899,13 @@ impl Handler<SupervisionFailureMessage> for PythonActor {
899899
// TODO: We also don't want to deliver multiple supervision
900900
// events from the same mesh if an earlier one is handled.
901901
tracing::info!(
902-
"__supervise__ on {} handled a supervision event, not reporting any further: {}",
902+
name = "ActorMeshStatus",
903+
status = "SupervisionError::Handled",
904+
// only care about the event sender when the message is handled
905+
actor_name = message.actor_mesh_name,
906+
event = %message.event,
907+
"__supervise__ on {} handled a supervision event, not reporting any further",
903908
cx.self_id(),
904-
message.event
905909
);
906910
Ok(())
907911
} else {
@@ -913,6 +917,22 @@ impl Handler<SupervisionFailureMessage> for PythonActor {
913917

914918
// False -- we propagate the event onward, but update it with the fact that
915919
// this actor is now the event creator.
920+
for (actor_name, status) in [
921+
(
922+
message.actor_mesh_name.as_str(),
923+
"SupervisionError::Unhandled",
924+
),
925+
(cx.self_id().name(), "UnhandledSupervisionEvent"),
926+
] {
927+
tracing::info!(
928+
name = "ActorMeshStatus",
929+
status,
930+
actor_name,
931+
event = %message.event,
932+
"__supervise__ on {} did not handle a supervision event, reporting to the next next owner",
933+
cx.self_id(),
934+
);
935+
}
916936
let err = ActorErrorKind::UnhandledSupervisionEvent(Box::new(
917937
ActorSupervisionEvent::new(
918938
cx.self_id().clone(),
@@ -932,6 +952,22 @@ impl Handler<SupervisionFailureMessage> for PythonActor {
932952
// Include the event it was handling in the error message.
933953

934954
// Add to caused_by chain.
955+
for (actor_name, status) in [
956+
(
957+
message.actor_mesh_name.as_str(),
958+
"SupervisionError::__supervise__::exception",
959+
),
960+
(cx.self_id().name(), "UnhandledSupervisionEvent"),
961+
] {
962+
tracing::info!(
963+
name = "ActorMeshStatus",
964+
status,
965+
actor_name,
966+
event = %message.event,
967+
"__supervise__ on {} threw an exception",
968+
cx.self_id(),
969+
);
970+
}
935971
let err = ActorErrorKind::UnhandledSupervisionEvent(Box::new(
936972
ActorSupervisionEvent::new(
937973
cx.self_id().clone(),

monarch_hyperactor/src/supervision.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ create_exception!(
2424

2525
#[derive(Clone, Debug, Serialize, Deserialize, Named, PartialEq, Bind, Unbind)]
2626
pub struct SupervisionFailureMessage {
27-
pub mesh_name: String,
27+
pub actor_mesh_name: String,
2828
pub rank: usize,
2929
pub event: ActorSupervisionEvent,
3030
}
@@ -54,7 +54,7 @@ impl MeshFailure {
5454
impl From<SupervisionFailureMessage> for MeshFailure {
5555
fn from(message: SupervisionFailureMessage) -> Self {
5656
Self {
57-
mesh_name: message.mesh_name,
57+
mesh_name: message.actor_mesh_name,
5858
rank: message.rank,
5959
event: message.event,
6060
}

monarch_hyperactor/src/v1/actor_mesh.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -443,7 +443,7 @@ fn send_state_change<F>(
443443
// Send a notification to the owning actor of this mesh, if there is one.
444444
if let Some(owner) = owner {
445445
if let Err(error) = owner.send(SupervisionFailureMessage {
446-
mesh_name: actor_mesh_name.to_string(),
446+
actor_mesh_name: actor_mesh_name.to_string(),
447447
rank,
448448
event: event.clone(),
449449
}) {

0 commit comments

Comments
 (0)