Skip to content

Commit c2c388e

Browse files
authored
Add health check API (#83)
* Add health check API * remove kill time * log health check requests
1 parent 159caf4 commit c2c388e

File tree

7 files changed

+211
-17
lines changed

7 files changed

+211
-17
lines changed

Cargo.lock

Lines changed: 83 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

backend/Cargo.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,9 @@ tracing = "0.1"
1616
tracing-subscriber = { version = "0.3", features = ["env-filter"] }
1717
alloy-primitives = { version = "0.8", features = ["serde"] }
1818
anyhow = "1.0.100"
19+
hyper = { version = "1.5", features = ["server", "http1"] }
20+
hyper-util = { version = "0.1", features = ["tokio"] }
21+
http-body-util = "0.1"
1922

2023
[dependencies.monad-exec-events]
2124
git = "https://github.com/category-labs/monad-bft"

backend/Dockerfile

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,8 @@ RUN cargo build --release --bin backend
2525

2626
# Expose WebSocket and health check ports
2727
EXPOSE 8443
28+
EXPOSE 443
2829

2930
# Set entrypoint with default server address for container
3031
ENTRYPOINT ["cargo", "run", "--release", "--bin", "backend", "--"]
31-
CMD ["--server-addr", "0.0.0.0:8443"]
32+
CMD ["--server-addr", "0.0.0.0:8443", "--health-server-addr", "0.0.0.0:443"]

backend/Dockerfile_build_and_publish

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,8 @@ COPY --from=builder /usr/src/app/restricted_filters.json /app/restricted_filters
4646

4747
# Expose WebSocket and health check ports
4848
EXPOSE 8443
49+
EXPOSE 443
4950

5051
# Set entrypoint using absolute path or relative to WORKDIR
5152
ENTRYPOINT ["./backend"]
52-
CMD ["--server-addr", "0.0.0.0:8443"]
53+
CMD ["--server-addr", "0.0.0.0:8443", "--health-server-addr", "0.0.0.0:443"]

backend/src/bin/backend.rs

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,9 @@ pub struct Cli {
1818

1919
#[arg(short, long, default_value = "127.0.0.1:3000")]
2020
server_addr: String,
21+
22+
#[arg(long, default_value = "127.0.0.1:4000")]
23+
health_server_addr: String,
2124
}
2225

2326
#[tokio::main]
@@ -33,6 +36,7 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
3336
let Cli {
3437
event_ring_path,
3538
server_addr,
39+
health_server_addr,
3640
} = Cli::parse();
3741

3842
// Resolve the event ring path
@@ -47,12 +51,13 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
4751
// Spawn the event listener thread
4852
let listener_handle = event_listener::run_event_listener(event_ring_path, event_sender);
4953

50-
// Parse server address
54+
// Parse server addresses
5155
let addr: SocketAddr = server_addr.parse()?;
56+
let health_addr: SocketAddr = health_server_addr.parse()?;
5257

5358
// Run both tasks and exit when either completes
5459
tokio::select! {
55-
result = server::run_websocket_server(addr, event_receiver) => {
60+
result = server::run_servers(addr, health_addr, event_receiver) => {
5661
warn!("WebSocket server stopped: {:?}", result);
5762
}
5863
_ = tokio::task::spawn_blocking(move || listener_handle.join()) => {

backend/src/lib/server.rs

Lines changed: 107 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,14 @@
11
use std::net::SocketAddr;
2+
use std::sync::atomic::{AtomicU64, Ordering};
3+
use std::sync::Arc;
4+
use std::time::{SystemTime, UNIX_EPOCH};
25

36
use alloy_primitives::{Address, B256};
7+
use http_body_util::Full;
8+
use hyper::body::Bytes;
9+
use hyper::server::conn::http1;
10+
use hyper::service::service_fn;
11+
use hyper::{Request, Response, StatusCode};
412
use futures_util::stream::SplitSink;
513
use futures_util::{stream::SplitStream, SinkExt, StreamExt};
614
use monad_exec_events::ExecEvent;
@@ -18,6 +26,9 @@ use super::event_filter::EventFilter;
1826
use super::event_listener::EventData;
1927
use super::serializable_event::SerializableEventData;
2028

29+
/// Stores the Unix timestamp (in seconds) of the last event received from the ring
30+
type LastEventTime = Arc<AtomicU64>;
31+
2132
#[derive(Debug, Clone, Serialize, Deserialize)]
2233
pub struct TopAccessesData {
2334
pub account: Vec<AccessEntry<Address>>,
@@ -188,6 +199,7 @@ async fn client_read_task(
188199
async fn run_event_forwarder_task(
189200
mut event_receiver: tokio::sync::mpsc::Receiver<EventData>,
190201
event_broadcast_sender: broadcast::Sender<EventDataOrMetrics>,
202+
last_event_time: LastEventTime,
191203
) {
192204
let mut account_accesses = TopKTracker::new(1_000);
193205
let mut storage_accesses = TopKTracker::new(1_000);
@@ -207,6 +219,13 @@ async fn run_event_forwarder_task(
207219
}
208220
let mut event_data = event_data.unwrap();
209221

222+
// Update last event timestamp for health check
223+
let now_secs = SystemTime::now()
224+
.duration_since(UNIX_EPOCH)
225+
.unwrap_or_default()
226+
.as_secs();
227+
last_event_time.store(now_secs, Ordering::Relaxed);
228+
210229
// Track txn_hash from TxnHeaderStart events
211230
if let EventName::TxnHeaderStart = event_data.event_name {
212231
if let ExecEvent::TxnHeaderStart { txn_index, txn_header_start, .. } = &event_data.payload {
@@ -341,20 +360,60 @@ async fn handle_connection(
341360
info!("WebSocket connection closed: {}", addr);
342361
}
343362

344-
pub async fn run_websocket_server(
345-
server_addr: SocketAddr,
346-
event_receiver: tokio::sync::mpsc::Receiver<EventData>,
347-
) -> Result<(), Box<dyn std::error::Error>> {
348-
// Create a broadcast channel for distributing events to all clients
349-
let (event_broadcast_sender, _) = broadcast::channel::<EventDataOrMetrics>(1_000_000);
363+
async fn health_handler(
364+
last_event_time: LastEventTime,
365+
) -> Result<Response<Full<Bytes>>, hyper::Error> {
366+
let now_secs = SystemTime::now()
367+
.duration_since(UNIX_EPOCH)
368+
.unwrap_or_default()
369+
.as_secs();
370+
let last_event = last_event_time.load(Ordering::Relaxed);
371+
let is_healthy = now_secs.saturating_sub(last_event) <= 10;
372+
373+
let body = if is_healthy {
374+
info!("Health check passed");
375+
r#"{"success": true}"#
376+
} else {
377+
warn!("Health check failed - last event time: {} seconds ago", now_secs.saturating_sub(last_event));
378+
r#"{"success": false}"#
379+
};
350380

351-
// Spawn a task to forward events from the mpsc channel to the broadcast channel
352-
let event_broadcast_sender_clone = event_broadcast_sender.clone();
353-
let _ = tokio::spawn(run_event_forwarder_task(
354-
event_receiver,
355-
event_broadcast_sender_clone,
356-
));
381+
Ok(Response::builder()
382+
.status(StatusCode::OK)
383+
.header("Content-Type", "application/json")
384+
.body(Full::new(Bytes::from(body)))
385+
.unwrap())
386+
}
357387

388+
async fn run_health_server(
389+
health_addr: SocketAddr,
390+
last_event_time: LastEventTime,
391+
) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
392+
let listener = tokio::net::TcpListener::bind(health_addr).await?;
393+
info!("Health server listening on: {}", health_addr);
394+
395+
loop {
396+
let (stream, _) = listener.accept().await?;
397+
let io = hyper_util::rt::TokioIo::new(stream);
398+
let last_event_time = last_event_time.clone();
399+
400+
tokio::spawn(async move {
401+
let service = service_fn(move |_req: Request<hyper::body::Incoming>| {
402+
let last_event_time = last_event_time.clone();
403+
async move { health_handler(last_event_time).await }
404+
});
405+
406+
if let Err(e) = http1::Builder::new().serve_connection(io, service).await {
407+
error!("Health server connection error: {}", e);
408+
}
409+
});
410+
}
411+
}
412+
413+
async fn run_websocket_server(
414+
server_addr: SocketAddr,
415+
event_broadcast_sender: broadcast::Sender<EventDataOrMetrics>,
416+
) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
358417
// Bind the TCP listener
359418
let listener = TcpListener::bind(&server_addr).await?;
360419
info!("WebSocket server listening on: {}", server_addr);
@@ -380,3 +439,39 @@ pub async fn run_websocket_server(
380439
}
381440
}
382441
}
442+
443+
pub async fn run_servers(
444+
server_addr: SocketAddr,
445+
health_server_addr: SocketAddr,
446+
event_receiver: tokio::sync::mpsc::Receiver<EventData>,
447+
) -> Result<(), Box<dyn std::error::Error>> {
448+
// Create shared state for tracking last event time (for health checks)
449+
let last_event_time: LastEventTime = Arc::new(AtomicU64::new(0));
450+
451+
// Create a broadcast channel for distributing events to all clients
452+
let (event_broadcast_sender, _) = broadcast::channel::<EventDataOrMetrics>(1_000_000);
453+
454+
// Spawn a task to forward events from the mpsc channel to the broadcast channel
455+
let event_broadcast_sender_clone = event_broadcast_sender.clone();
456+
let last_event_time_clone = last_event_time.clone();
457+
tokio::spawn(run_event_forwarder_task(
458+
event_receiver,
459+
event_broadcast_sender_clone,
460+
last_event_time_clone,
461+
));
462+
463+
// Spawn both servers and wait for either to complete
464+
let websocket_task = tokio::spawn(run_websocket_server(server_addr, event_broadcast_sender));
465+
let health_task = tokio::spawn(run_health_server(health_server_addr, last_event_time));
466+
467+
tokio::select! {
468+
result = websocket_task => {
469+
error!("WebSocket server task stopped: {:?}", result);
470+
}
471+
result = health_task => {
472+
error!("Health server task stopped: {:?}", result);
473+
}
474+
}
475+
476+
Ok(())
477+
}

docker-compose.yml

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,13 @@ services:
1616
# Optional: Persistent logs volume
1717
- logs:/var/log/eventwatch
1818
restart: unless-stopped
19-
command: ["--server-addr", "0.0.0.0:8443"]
19+
command: ["--server-addr", "0.0.0.0:8443", "--health-server-addr", "0.0.0.0:443"]
20+
healthcheck:
21+
test: ["CMD-SHELL", "curl -sf http://0.0.0.0:443/health | grep -q '\"success\": true'"]
22+
interval: 10s
23+
timeout: 5s
24+
retries: 3
25+
start_period: 10s
2026
logging:
2127
driver: json-file
2228
options:

0 commit comments

Comments
 (0)