Skip to content

Commit f6c131e

Browse files
fix(cat-gateway): Improve panic logging (#3471)
* Use tracing::error instead of println for logging panic * Custom middleware for catching panics * Fix Clippy lints * Add missing documentation * More documentation: * added `/panic` endpoint as exception for `DatabaseConnectionImpl` * Open ports for the gateway container * Fix * Use cargo profile option for stripping the backend binary instead of the strip tool * fix spellcheck * Restore args * Remove stripping from Cargo config * Update the rust-ci version --------- Co-authored-by: Mr-Leshiy <[email protected]>
1 parent 8eb812e commit f6c131e

File tree

8 files changed

+172
-131
lines changed

8 files changed

+172
-131
lines changed

catalyst-gateway/Earthfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
VERSION 0.8
22

3-
IMPORT github.com/input-output-hk/catalyst-ci/earthly/rust:v3.5.20 AS rust-ci
3+
IMPORT github.com/input-output-hk/catalyst-ci/earthly/rust:v3.5.21 AS rust-ci
44
IMPORT ../ AS repo-ci
55
IMPORT github.com/input-output-hk/catalyst-voices/catalyst-gateway:main AS cat-gateway-main
66

catalyst-gateway/bin/src/service/poem_service.rs

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
use cardano_chain_follower::Network;
66
use poem::{
77
listener::TcpListener,
8-
middleware::{CatchPanic, Compression, Cors, SensitiveHeader},
8+
middleware::{Compression, Cors, SensitiveHeader},
99
web::CompressionLevel,
1010
Endpoint, EndpointExt, Route,
1111
};
@@ -19,8 +19,12 @@ use crate::{
1919
api::mk_api,
2020
docs::{docs, favicon},
2121
utilities::{
22-
catch_panic::{panic_endpoint, set_panic_hook, ServicePanicHandler},
23-
middleware::{metrics_updater::MetricsUpdaterMiddleware, tracing_mw::Tracing},
22+
middleware::{
23+
catch_panic::{set_panic_hook, CatchPanicMiddleware},
24+
metrics_updater::MetricsUpdaterMiddleware,
25+
tracing_mw::Tracing,
26+
},
27+
panic::panic_endpoint,
2428
},
2529
},
2630
settings::Settings,
@@ -48,7 +52,7 @@ fn mk_app(base_route: Option<Route>) -> impl Endpoint {
4852
.nest("/favicon.ico", favicon())
4953
.with(Cors::new())
5054
.with(Compression::new().with_quality(CompressionLevel::Fastest))
51-
.with(CatchPanic::new().with_handler(ServicePanicHandler))
55+
.with(CatchPanicMiddleware::new())
5256
.with(Tracing)
5357
.with(DatabaseConnectionCheck)
5458
.with(CatGatewayInfo)

catalyst-gateway/bin/src/service/utilities/catch_panic.rs

Lines changed: 0 additions & 124 deletions
This file was deleted.
Lines changed: 151 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,151 @@
1+
//! A middleware to catch and log panics.
2+
3+
use std::{any::Any, backtrace::Backtrace, cell::RefCell, panic::AssertUnwindSafe};
4+
5+
use futures::FutureExt;
6+
use panic_message::panic_message;
7+
use poem::{
8+
http::{HeaderMap, Method, StatusCode, Uri},
9+
Endpoint, IntoResponse, Middleware, Request, Response,
10+
};
11+
use poem_openapi::payload::Json;
12+
use tracing::{debug, error};
13+
14+
use crate::{
15+
service::{
16+
common::responses::code_500_internal_server_error::InternalServerError,
17+
utilities::health::{get_live_counter, inc_live_counter, set_not_live},
18+
},
19+
settings::Settings,
20+
};
21+
22+
// Allows us to catch the backtrace so we can include it in logs.
23+
thread_local! {
24+
static BACKTRACE: RefCell<Option<String>> = const { RefCell::new(None) };
25+
static LOCATION: RefCell<Option<String>> = const { RefCell::new(None) };
26+
}
27+
28+
/// Sets a custom panic hook to capture the Backtrace and Panic Location for logging
29+
/// purposes. This hook gets called BEFORE we catch it. So the thread local variables
30+
/// stored here are valid when processing the panic capture.
31+
pub(crate) fn set_panic_hook() {
32+
std::panic::set_hook(Box::new(|panic_info| {
33+
// Get the backtrace and format it.
34+
let raw_trace = Backtrace::force_capture();
35+
let trace = format!("{raw_trace}");
36+
BACKTRACE.with(move |b| b.borrow_mut().replace(trace));
37+
38+
// Get the location and format it.
39+
let location = match panic_info.location() {
40+
Some(location) => format!("{location}"),
41+
None => "Unknown".to_string(),
42+
};
43+
LOCATION.with(move |l| l.borrow_mut().replace(location));
44+
}));
45+
}
46+
47+
/// A middleware for catching and logging panics transforming them to the 500 error
48+
/// response. The panic will not crash the service, but it becomes not live after
49+
/// exceeding the `Settings::service_live_counter_threshold()` value. That should cause
50+
/// Kubernetes to restart the service.
51+
pub struct CatchPanicMiddleware {}
52+
53+
impl CatchPanicMiddleware {
54+
/// Creates a new middleware instance.
55+
pub fn new() -> Self {
56+
Self {}
57+
}
58+
}
59+
60+
impl<E: Endpoint> Middleware<E> for CatchPanicMiddleware {
61+
type Output = CatchPanicEndpoint<E>;
62+
63+
fn transform(
64+
&self,
65+
ep: E,
66+
) -> Self::Output {
67+
CatchPanicEndpoint { inner: ep }
68+
}
69+
}
70+
71+
/// An endpoint for the `CatchPanicMiddleware` middleware.
72+
pub struct CatchPanicEndpoint<E> {
73+
/// An inner endpoint.
74+
inner: E,
75+
}
76+
77+
impl<E: Endpoint> Endpoint for CatchPanicEndpoint<E> {
78+
type Output = Response;
79+
80+
async fn call(
81+
&self,
82+
req: Request,
83+
) -> poem::Result<Self::Output> {
84+
// Preserve all the data that we want to potentially log because a request is consumed.
85+
let method = req.method().clone();
86+
let uri = req.uri().clone();
87+
let headers = req.headers().clone();
88+
89+
match AssertUnwindSafe(self.inner.call(req)).catch_unwind().await {
90+
Ok(resp) => resp.map(IntoResponse::into_response),
91+
Err(err) => Ok(panic_response(&err, &method, &uri, &headers)),
92+
}
93+
}
94+
}
95+
96+
/// Converts a panic to a response.
97+
fn panic_response(
98+
err: &Box<dyn Any + Send + 'static>,
99+
method: &Method,
100+
uri: &Uri,
101+
headers: &HeaderMap,
102+
) -> Response {
103+
// Increment the counter used for liveness checks.
104+
inc_live_counter();
105+
106+
let current_count = get_live_counter();
107+
debug!(
108+
live_counter = current_count,
109+
"Handling service panic response"
110+
);
111+
112+
// If current count is above the threshold, then flag the system as NOT live.
113+
if current_count > Settings::service_live_counter_threshold() {
114+
set_not_live();
115+
}
116+
117+
let server_err = InternalServerError::new(None);
118+
119+
// Get the unique identifier for this panic, so we can find it in the logs.
120+
let panic_identifier = server_err.id().to_string();
121+
122+
// Get the message from the panic as best we can.
123+
let err_msg = panic_message(err);
124+
125+
// This is the location of the panic.
126+
let location = match LOCATION.with(|l| l.borrow_mut().take()) {
127+
Some(location) => location,
128+
None => "Unknown".to_string(),
129+
};
130+
131+
// This is the backtrace of the panic.
132+
let backtrace = match BACKTRACE.with(|b| b.borrow_mut().take()) {
133+
Some(backtrace) => backtrace,
134+
None => "Unknown".to_string(),
135+
};
136+
137+
error!(
138+
panic = true,
139+
backtrace = backtrace,
140+
location = location,
141+
id = panic_identifier,
142+
message = err_msg,
143+
method = ?method,
144+
uri = ?uri,
145+
headers = ?headers,
146+
);
147+
148+
let mut resp = Json(server_err).into_response();
149+
resp.set_status(StatusCode::INTERNAL_SERVER_ERROR);
150+
resp
151+
}

catalyst-gateway/bin/src/service/utilities/middleware/db_check.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ impl<E: Endpoint> Endpoint for DatabaseConnectionImpl<E> {
3737

3838
// TODO: find a better way to filter URI paths
3939
let is_health_endpoint = req_path.starts_with("/api/v1/health/");
40-
let is_metrics_endpoint = req_path == "/metrics";
40+
let is_metrics_endpoint = req_path == "/metrics" || req_path == "/panic";
4141

4242
if !(is_health_endpoint || is_metrics_endpoint) {
4343
if !event_db_is_live() {

catalyst-gateway/bin/src/service/utilities/middleware/mod.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
//! Custom POEM Middleware for this service.
22
3+
pub(crate) mod catch_panic;
34
pub(crate) mod db_check;
45
pub(crate) mod metrics_updater;
56
pub(crate) mod node_info;

catalyst-gateway/bin/src/service/utilities/mod.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
11
//! `API` Utility operations
22
pub(crate) mod cache;
3-
pub(crate) mod catch_panic;
43
pub(crate) mod convert;
54
pub(crate) mod health;
65
pub(crate) mod middleware;
76
pub(crate) mod net;
7+
pub(crate) mod panic;
88

99
use anyhow::{bail, Result};
1010
// use pallas::ledger::addresses::Network as PallasNetwork;
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
//! An implementation of the panic endpoint.
2+
3+
/// Implementation of the special `/panic` endpoint,
4+
/// which is used only for testing purposes and enabled for all networks except `Mainnet`
5+
#[poem::handler]
6+
#[allow(clippy::panic)]
7+
pub(crate) fn panic_endpoint() {
8+
panic!("Intentional panicking")
9+
}

0 commit comments

Comments
 (0)