Skip to content

Commit f4deec6

Browse files
pzhan9meta-codesync[bot]
authored andcommitted
capture panic message inside panic_handler (#1890)
Summary: Pull Request resolved: #1890 The main motivation is by making this change, we can log panic message here too: https://www.internalfb.com/code/fbsource/[4a662228cd8bdf2bdf9b760e705cc9958f85e55c]/fbcode/monarch/hyperactor/src/panic_handler.rs?lines=47 it currently does not and caused quite some confusion for me, since I thought there is a bug in the panic catching logic. Reviewed By: mariusae Differential Revision: D87086712 fbshipit-source-id: 975c2f69dae43a7a3c74a8ab5976a6efd2afcedf
1 parent a643793 commit f4deec6

File tree

4 files changed

+107
-55
lines changed

4 files changed

+107
-55
lines changed

controller/src/lib.rs

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -643,6 +643,7 @@ mod tests {
643643
use hyperactor::mailbox::PortHandle;
644644
use hyperactor::mailbox::PortReceiver;
645645
use hyperactor::message::IndexedErasedUnbound;
646+
use hyperactor::panic_handler;
646647
use hyperactor::proc::Proc;
647648
use hyperactor::reference::GangId;
648649
use hyperactor::reference::ProcId;
@@ -1843,6 +1844,9 @@ mod tests {
18431844
// times out (both internal and external).
18441845
#[cfg_attr(not(fbcode_build), ignore)]
18451846
async fn test_supervision_fault() {
1847+
// Need this custom hook to store panic backtrace in task_local.
1848+
panic_handler::set_panic_hook();
1849+
18461850
// Start system actor.
18471851
let timeout: Duration = Duration::from_secs(6);
18481852
let server_handle = System::serve(
@@ -1939,6 +1943,10 @@ mod tests {
19391943
let Exception::Failure(err) = result.1.unwrap().unwrap_err() else {
19401944
panic!("Expected Failure exception");
19411945
};
1942-
assert!(err.backtrace.contains("some random failure"));
1946+
assert!(
1947+
err.backtrace.contains("some random failure"),
1948+
"got: {}",
1949+
err.backtrace
1950+
);
19431951
}
19441952
}

hyperactor/src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@
6666
#![feature(panic_update_hook)]
6767
#![feature(type_alias_impl_trait)]
6868
#![feature(trait_alias)]
69+
#![feature(panic_payload_as_str)]
6970
#![deny(missing_docs)]
7071

7172
pub mod accum;

hyperactor/src/panic_handler.rs

Lines changed: 93 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -12,30 +12,86 @@
1212
use std::backtrace::Backtrace;
1313
use std::cell::RefCell;
1414
use std::future::Future;
15-
use std::ops::Deref;
1615
use std::panic;
1716

17+
/// A struct to store the message and backtrace from a panic.
18+
pub(crate) struct PanicInfo {
19+
/// The message from the panic.
20+
message: String,
21+
/// The location where the panic occurred.
22+
location: Option<PanicLocation>,
23+
/// The backtrace from the panic.
24+
backtrace: Backtrace,
25+
}
26+
27+
impl std::fmt::Display for PanicInfo {
28+
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
29+
write!(f, "panic at ")?;
30+
match &self.location {
31+
Some(loc) => write!(f, "{}", loc)?,
32+
None => write!(f, "unavailable")?,
33+
}
34+
write!(f, ": {}\n{}", self.message, self.backtrace)
35+
}
36+
}
37+
38+
/// A struct to store location information from a panic with owned data
39+
#[derive(Clone, Debug)]
40+
struct PanicLocation {
41+
file: String,
42+
line: u32,
43+
column: u32,
44+
}
45+
46+
impl From<&panic::Location<'_>> for PanicLocation {
47+
fn from(loc: &panic::Location<'_>) -> Self {
48+
Self {
49+
file: loc.file().to_string(),
50+
line: loc.line(),
51+
column: loc.column(),
52+
}
53+
}
54+
}
55+
56+
impl std::fmt::Display for PanicLocation {
57+
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
58+
write!(f, "{}:{}:{}", self.file, self.line, self.column)
59+
}
60+
}
61+
1862
tokio::task_local! {
1963
/// A task_local variable to store the backtrace from a panic, so it can be
2064
/// retrieved later.
21-
static BACKTRACE: RefCell<Option<String>>;
65+
static BACKTRACE: RefCell<Option<PanicInfo>>;
2266
}
2367

2468
/// Call this from the main method of your application, and use it in conjunction
25-
/// with [[with_backtrace_tracking]] and [[take_panic_backtrace]], in order to
69+
/// with [[with_backtrace_tracking]] and [[take_panic_info]], in order to
2670
/// capture the backtrace from a panic.
2771
pub fn set_panic_hook() {
2872
panic::update_hook(move |prev, info| {
29-
// Ignore AccessError, which would happen if panic occurred outside of
30-
// BACKTRACE's scope.
3173
let backtrace = Backtrace::force_capture();
32-
let loc = info.location().map_or_else(
33-
|| "unavailable".to_owned(),
34-
|loc: &panic::Location<'_>| format!("{}:{}:{}", loc.file(), loc.line(), loc.column()),
35-
);
74+
75+
// Extract the panic message from the payload
76+
let panic_msg = if let Some(s) = info.payload_as_str() {
77+
s.to_string()
78+
} else {
79+
"panic message was not a string".to_string()
80+
};
81+
82+
let location = info.location().map(PanicLocation::from);
83+
let loc_str = location
84+
.as_ref()
85+
.map_or_else(|| "unavailable".to_owned(), |l| l.to_string());
86+
tracing::error!("stacktrace"=%backtrace, "panic at {loc_str}: {panic_msg}");
87+
3688
let _result = BACKTRACE.try_with(|entry| match entry.try_borrow_mut() {
3789
Ok(mut entry_ref) => {
38-
*entry_ref = Some(format!("panicked at {loc}\n{backtrace}"));
90+
*entry_ref = Some(PanicInfo {
91+
message: panic_msg,
92+
location,
93+
backtrace,
94+
});
3995
}
4096
Err(borrow_mut_error) => {
4197
eprintln!(
@@ -44,7 +100,6 @@ pub fn set_panic_hook() {
44100
);
45101
}
46102
});
47-
tracing::error!("stacktrace"=%backtrace, "panic at {loc}");
48103

49104
// Execute the default hood to preserve the default behavior.
50105
prev(info);
@@ -53,7 +108,7 @@ pub fn set_panic_hook() {
53108

54109
/// Set a task_local variable for this future f, so any panic occurred in f can
55110
/// be stored and retrieved later.
56-
pub async fn with_backtrace_tracking<F>(f: F) -> F::Output
111+
pub(crate) async fn with_backtrace_tracking<F>(f: F) -> F::Output
57112
where
58113
F: Future,
59114
{
@@ -62,20 +117,21 @@ where
62117

63118
/// Take the backtrace from the task_local variable, and reset the task_local to
64119
/// None. Return error if the backtrace is not stored, or cannot be retrieved.
65-
pub fn take_panic_backtrace() -> Result<String, anyhow::Error> {
66-
BACKTRACE.try_with(|entry| {
67-
entry.try_borrow_mut().map(|mut entry_ref| {
68-
let result = match entry_ref.deref() {
69-
Some(bt) => Ok(bt.to_string()),
70-
None => Err(anyhow::anyhow!("nothing is stored in task_local")),
71-
};
72-
// Clear the task_local because the backtrace has been retrieve.
73-
if result.is_ok() {
74-
*entry_ref = None;
75-
}
76-
result
120+
pub(crate) fn take_panic_info() -> Result<PanicInfo, anyhow::Error> {
121+
BACKTRACE
122+
.try_with(|entry| {
123+
entry
124+
.try_borrow_mut()
125+
.map_err(|e| anyhow::anyhow!("failed to borrow task_local: {:?}", e))
126+
.and_then(|mut entry_ref| {
127+
// Use take because we want to clear the task_local after
128+
// the panic info has been retrieve.
129+
entry_ref
130+
.take()
131+
.ok_or_else(|| anyhow::anyhow!("nothing is stored in task_local"))
132+
})
77133
})
78-
})??
134+
.map_err(|e| anyhow::anyhow!("failed to access task_local: {:?}", e))?
79135
}
80136

81137
#[cfg(test)]
@@ -99,16 +155,16 @@ mod tests {
99155
with_backtrace_tracking(async {
100156
execute_panic().await;
101157
// Verify backtrace can be taken successfully.
102-
assert!(take_panic_backtrace().is_ok());
158+
assert!(take_panic_info().is_ok());
103159
// Cannot take backtrace again because task_local is reset in the
104160
// previous take.
105-
assert!(take_panic_backtrace().is_err());
161+
assert!(take_panic_info().is_err());
106162
})
107163
.await;
108164

109165
// Cannot get backtrace because this is out of the set task_local's
110166
// scope.
111-
assert!(take_panic_backtrace().is_err());
167+
assert!(take_panic_info().is_err());
112168
}
113169

114170
#[tokio::test]
@@ -117,7 +173,7 @@ mod tests {
117173
async {
118174
execute_panic().await;
119175
// Cannot get backtrace because task_local is not set.
120-
assert!(take_panic_backtrace().is_err());
176+
assert!(take_panic_info().is_err());
121177
}
122178
.await;
123179
}
@@ -128,7 +184,7 @@ mod tests {
128184
with_backtrace_tracking(async {
129185
execute_panic().await;
130186
// Cannot get backtrace because the custom panic hook is not set.
131-
assert!(take_panic_backtrace().is_err());
187+
assert!(take_panic_info().is_err());
132188
})
133189
.await;
134190
}
@@ -143,13 +199,11 @@ mod tests {
143199
.await;
144200
assert!(result.is_err());
145201
if backtrace_captured {
146-
assert!(
147-
take_panic_backtrace()
148-
.unwrap()
149-
.contains("verify_inner_panic")
150-
);
202+
let info = take_panic_info().unwrap();
203+
assert_eq!(info.message, "wow!");
204+
assert!(info.backtrace.to_string().contains("verify_inner_panic"));
151205
} else {
152-
assert!(take_panic_backtrace().is_err());
206+
assert!(take_panic_info().is_err());
153207
}
154208
}
155209

@@ -169,11 +223,9 @@ mod tests {
169223
assert!(result.is_ok());
170224

171225
// Verify the outer task can get its own backtrace.
172-
assert!(
173-
take_panic_backtrace()
174-
.unwrap()
175-
.contains("test_nested_tasks")
176-
);
226+
let info = take_panic_info().unwrap();
227+
assert_eq!(info.message, "boom!");
228+
assert!(info.backtrace.to_string().contains("test_nested_tasks"));
177229
})
178230
.await;
179231
}

hyperactor/src/proc.rs

Lines changed: 4 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1290,23 +1290,14 @@ impl<A: Actor> Instance<A> {
12901290
.await
12911291
{
12921292
Ok(result) => result,
1293-
Err(err) => {
1294-
// This is only the error message. Backtrace is not included.
1293+
Err(_) => {
12951294
did_panic = true;
1296-
let err_msg = err
1297-
.downcast_ref::<&str>()
1298-
.copied()
1299-
.or_else(|| err.downcast_ref::<String>().map(|s| s.as_str()))
1300-
.unwrap_or("panic cannot be downcasted");
1301-
1302-
let backtrace = panic_handler::take_panic_backtrace()
1295+
let panic_info = panic_handler::take_panic_info()
1296+
.map(|info| info.to_string())
13031297
.unwrap_or_else(|e| format!("Cannot take backtrace due to: {:?}", e));
13041298
Err(ActorError::new(
13051299
self.self_id(),
1306-
ActorErrorKind::panic(anyhow::anyhow!(
1307-
"{}
1308-
{}", err_msg, backtrace
1309-
)),
1300+
ActorErrorKind::panic(anyhow::anyhow!(panic_info)),
13101301
))
13111302
}
13121303
};

0 commit comments

Comments
 (0)