Skip to content

Commit 2fc4680

Browse files
dulinrileymeta-codesync[bot]
authored andcommitted
Move critical log of exceptions on endpoints to monarch file (#1934)
Summary: Pull Request resolved: #1934 Move the log of endpoint exceptions from the user logger (logging in python) to use monarch's tracing logger which goes to a separate file. Log as info because it's only an error to the user's perspective, not from monarch's perspective. Reviewed By: zdevito Differential Revision: D87459440 fbshipit-source-id: 586a38c2ebeb19efa8596cdd362e6b84b0e5af4b
1 parent ed514e4 commit 2fc4680

File tree

3 files changed

+56
-16
lines changed

3 files changed

+56
-16
lines changed

monarch_hyperactor/src/v1/logging.rs

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ use ndslice::View;
2424
use pyo3::Bound;
2525
use pyo3::prelude::*;
2626
use pyo3::types::PyModule;
27+
use pyo3::types::PyString;
2728

2829
use crate::context::PyInstance;
2930
use crate::instance_dispatch;
@@ -416,13 +417,53 @@ impl Drop for LoggingMeshClient {
416417
}
417418
}
418419

420+
/// Turns a python exception into a string with a traceback. If the traceback doesn't
421+
/// exist or can't be formatted, returns just the exception message.
422+
fn format_traceback<'py>(py: Python<'py>, err: PyErr) -> String {
423+
let traceback = err.traceback(py);
424+
if traceback.is_some() {
425+
let inner = || -> PyResult<String> {
426+
let formatted = py
427+
.import("traceback")?
428+
.call_method1("format_exception", (err.clone_ref(py),))?;
429+
Ok(PyString::new(py, "")
430+
.call_method1("join", (formatted,))?
431+
.to_string())
432+
};
433+
match inner() {
434+
Ok(s) => s,
435+
Err(e) => format!("{}: no traceback {}", err, e),
436+
}
437+
} else {
438+
err.to_string()
439+
}
440+
}
441+
442+
#[pyfunction]
443+
fn log_endpoint_exception<'py>(py: Python<'py>, e: PyObject, endpoint: PyObject) {
444+
let pyerr = PyErr::from_value(e.into_bound(py));
445+
let exception_str = format_traceback(py, pyerr);
446+
let endpoint = endpoint.into_bound(py).to_string();
447+
tracing::info!(
448+
%endpoint,
449+
"exception occurred in endpoint: {}",
450+
exception_str,
451+
);
452+
}
453+
419454
/// Register the Python-facing types for this module.
420455
///
421456
/// `pyo3` calls this when building `monarch._rust_bindings...`. We
422457
/// expose `LoggingMeshClient` so that Python can construct it and
423458
/// call its methods (`spawn`, `set_mode`, `flush`, ...).
424459
pub fn register_python_bindings(module: &Bound<'_, PyModule>) -> PyResult<()> {
425460
module.add_class::<LoggingMeshClient>()?;
461+
let log_endpoint_exception = wrap_pyfunction!(log_endpoint_exception, module.py())?;
462+
log_endpoint_exception.setattr(
463+
"__module__",
464+
"monarch._rust_bindings.monarch_hyperactor.v1.logging",
465+
)?;
466+
module.add_function(log_endpoint_exception)?;
426467
Ok(())
427468
}
428469

python/monarch/_rust_bindings/monarch_hyperactor/v1/logging.pyi

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,3 +32,5 @@ class LoggingMeshClient:
3232
level: int,
3333
) -> None: ...
3434
def flush(self, instance: Instance) -> PythonTask[None]: ...
35+
36+
def log_endpoint_exception(e: Exception, endpoint: str | None) -> None: ...

python/monarch/_src/actor/actor_mesh.py

Lines changed: 13 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,7 @@
7272
Region,
7373
Shape,
7474
)
75+
from monarch._rust_bindings.monarch_hyperactor.v1.logging import log_endpoint_exception
7576
from monarch._rust_bindings.monarch_hyperactor.value_mesh import (
7677
ValueMesh as HyValueMesh,
7778
)
@@ -970,6 +971,7 @@ async def handle(
970971
local_state: Iterable[Any],
971972
response_port: "PortProtocol[Any]",
972973
) -> None:
974+
method_name = None
973975
MESSAGES_HANDLED.add(1)
974976
# response_port can be None. If so, then sending to port will drop the response,
975977
# and raise any exceptions to the caller.
@@ -1000,14 +1002,15 @@ async def handle(
10001002
self._saved_error = ActorError(
10011003
e, f"Remote actor {Class}.__init__ call failed."
10021004
)
1003-
raise e
1005+
raise
10041006
response_port.send(None)
10051007
return None
10061008
case MethodSpecifier.ReturnsResponse(name=method_name):
10071009
pass
10081010
case MethodSpecifier.ExplicitPort(name=method_name):
10091011
args = (response_port, *args)
10101012
response_port = DroppingPort()
1013+
assert isinstance(method_name, str)
10111014

10121015
if self.instance is None:
10131016
# This could happen because of the following reasons. Both
@@ -1036,22 +1039,15 @@ async def handle(
10361039
the_method = functools.partial(the_method._method, self.instance)
10371040

10381041
if inspect.iscoroutinefunction(the_method):
1039-
try:
1040-
if should_instrument:
1041-
with TRACER.start_as_current_span(
1042-
method_name,
1043-
attributes={"actor_id": str(ctx.actor_instance.actor_id)},
1044-
):
1045-
result = await the_method(*args, **kwargs)
1046-
else:
1042+
if should_instrument:
1043+
with TRACER.start_as_current_span(
1044+
method_name,
1045+
attributes={"actor_id": str(ctx.actor_instance.actor_id)},
1046+
):
10471047
result = await the_method(*args, **kwargs)
1048-
self._maybe_exit_debugger()
1049-
except Exception as e:
1050-
logging.critical(
1051-
"Unhandled exception in actor endpoint",
1052-
exc_info=e,
1053-
)
1054-
raise e
1048+
else:
1049+
result = await the_method(*args, **kwargs)
1050+
self._maybe_exit_debugger()
10551051
else:
10561052
with fake_sync_state():
10571053
if should_instrument:
@@ -1066,6 +1062,7 @@ async def handle(
10661062

10671063
response_port.send(result)
10681064
except Exception as e:
1065+
log_endpoint_exception(e, method_name)
10691066
self._post_mortem_debug(e.__traceback__)
10701067
response_port.exception(ActorError(e))
10711068
except BaseException as e:

0 commit comments

Comments
 (0)