Skip to content

Commit ec8f2dd

Browse files
pzhan9 authored and meta-codesync[bot] committed
Log events for SupervisionError::UnhandledFaultHook (meta-pytorch#1987)
Summary:
Pull Request resolved: meta-pytorch#1987
Reviewed By: moonli
Differential Revision: D87787311
fbshipit-source-id: 8616af081d967aa772f5a6776555200a6cd68b5f
1 parent d88c6f0 commit ec8f2dd

File tree

1 file changed

+15
-0
lines changed

1 file changed

+15
-0
lines changed

monarch_hyperactor/src/v1/actor_mesh.rs

Lines changed: 15 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -266,6 +266,11 @@ impl PythonActorMeshImpl {
266266
.extract::<i32>()
267267
.unwrap();
268268
tracing::error!(
269+
name = "ActorMeshStatus",
270+
status = "SupervisionError::UnhandledFaultHook",
271+
actor_name = failure.mesh_name,
272+
event = %failure.event,
273+
rank = failure.rank,
269274
"unhandled event reached unhandled_fault_hook: {}, which is exiting the process with code {}",
270275
failure,
271276
code
@@ -275,6 +280,11 @@ impl PythonActorMeshImpl {
275280
// The callback raised some other exception, and there's
276281
// no way to handle it. Just exit the process anyways
277282
tracing::error!(
283+
name = "ActorMeshStatus",
284+
status = "SupervisionError::UnhandledFaultHook",
285+
actor_name = failure.mesh_name,
286+
event = %failure.event,
287+
rank = failure.rank,
278288
"unhandled event reached unhandled_fault_hook: {}, which raised an exception: {:?}. \
279289
Exiting the process with code 1",
280290
failure,
@@ -284,6 +294,11 @@ impl PythonActorMeshImpl {
284294
}
285295
} else {
286296
tracing::warn!(
297+
name = "ActorMeshStatus",
298+
status = "SupervisionError::UnhandledFaultHook",
299+
actor_name = failure.mesh_name,
300+
event = %failure.event,
301+
rank = failure.rank,
287302
"unhandled event reached unhandled_fault_hook: {}, but that function produced no exception or crash. Ignoring the error",
288303
failure
289304
);

0 commit comments

Comments
 (0)