Skip to content

Commit 44c55e6

Browse files
authored
Manage sync threads failures (#49)
This PR makes bunch of changes to ways how local sync and s3 sync threads are managed inside parseable. The goal is to make parseable main thread be aware of unwanted failure inside either of these dedicated sync thread. Additionally we want to have control over life cycle of these threads and have them stop on command, This can be later explored in future if we want to have graceful shutdown an also to guarantee more data consistency in case of failure. This solution works something like this: Sync threads are spawned with pair of one shot channels (named with suffix inbox and outbox) for communication with main thread. inbox is sender type which can be used to stop the respective thread by breaking its scheduler loop. outbox is receiver type which is polled inside the main thread's looped select. So that whenever we receive a message from thread we stop. scheduler is ran inside loop which runs pending tasks and polls the channel for any message from main thread ( if any then stop the thread ) All of this is wrapped by a catch unwind. So in case of an unwinding because of panic there is a chance to let main thread know and handle accordingly. This is added as a safeguard for now but later we need to verify that local_sync and s3_sync don't have anything that can panic. Fixes #43
1 parent 4e539de commit 44c55e6

File tree

3 files changed

+105
-23
lines changed

3 files changed

+105
-23
lines changed

server/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ structopt = { version = "0.3.25" }
4444
sysinfo = "0.20.5"
4545
thiserror = "1"
4646
tokio-stream = "0.1.8"
47+
tokio = { version = "1.13.1", default-features = false, features=["sync", "macros"] }
4748
clokwerk = "0.4.0-rc1"
4849
actix-web-static-files = "4.0"
4950
static-files = "0.2.1"

server/src/main.rs

Lines changed: 103 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -22,13 +22,17 @@ use actix_web::{middleware, web, App, HttpServer};
2222
use actix_web_httpauth::extractors::basic::BasicAuth;
2323
use actix_web_httpauth::middleware::HttpAuthentication;
2424
use actix_web_static_files::ResourceFiles;
25-
use clokwerk::{AsyncScheduler, TimeUnits};
25+
use clokwerk::{AsyncScheduler, Scheduler, TimeUnits};
2626
use log::warn;
2727
use openssl::ssl::{SslAcceptor, SslFiletype, SslMethod};
2828

2929
include!(concat!(env!("OUT_DIR"), "/generated.rs"));
3030

31-
use std::thread;
31+
use std::panic::{catch_unwind, AssertUnwindSafe};
32+
use std::thread::{self, JoinHandle};
33+
use std::time::Duration;
34+
use tokio::sync::oneshot;
35+
use tokio::sync::oneshot::error::TryRecvError;
3236

3337
mod banner;
3438
mod error;
@@ -63,33 +67,110 @@ async fn main() -> anyhow::Result<()> {
6367
if let Err(e) = metadata::STREAM_INFO.load(&storage).await {
6468
warn!("could not populate local metadata. {:?}", e);
6569
}
66-
thread::spawn(sync);
67-
run_http().await?;
6870

69-
Ok(())
70-
}
71+
let (localsync_handler, mut localsync_outbox, localsync_inbox) = run_local_sync();
72+
let (mut s3sync_handler, mut s3sync_outbox, mut s3sync_inbox) = s3_sync();
7173

72-
#[actix_web::main]
73-
async fn sync() {
74-
let mut scheduler = AsyncScheduler::new();
75-
scheduler
76-
.every((storage::LOCAL_SYNC_INTERVAL as u32).seconds())
77-
.run(|| async {
78-
if let Err(e) = S3::new().local_sync().await {
79-
warn!("failed to sync local data. {:?}", e);
74+
let app = run_http();
75+
tokio::pin!(app);
76+
loop {
77+
tokio::select! {
78+
e = &mut app => {
79+
// actix server finished .. stop other threads and stop the server
80+
s3sync_inbox.send(()).unwrap_or(());
81+
localsync_inbox.send(()).unwrap_or(());
82+
localsync_handler.join().unwrap_or(());
83+
s3sync_handler.join().unwrap_or(());
84+
return e
85+
},
86+
_ = &mut localsync_outbox => {
87+
// crash the server if localsync fails for any reason
88+
// panic!("Local Sync thread died. Server will fail now!")
89+
return Err(anyhow::Error::msg("Failed to sync local data to disc. This can happen due to critical error in disc or environment. Please restart the Parseable server.\n\nJoin us on Parseable Slack if the issue persists after restart : https://launchpass.com/parseable"))
90+
},
91+
_ = &mut s3sync_outbox => {
92+
// s3sync failed, this is recoverable by just starting s3sync thread again
93+
s3sync_handler.join().unwrap_or(());
94+
(s3sync_handler, s3sync_outbox, s3sync_inbox) = s3_sync();
8095
}
96+
};
97+
}
98+
}
99+
100+
fn s3_sync() -> (JoinHandle<()>, oneshot::Receiver<()>, oneshot::Sender<()>) {
101+
let (outbox_tx, outbox_rx) = oneshot::channel::<()>();
102+
let (inbox_tx, inbox_rx) = oneshot::channel::<()>();
103+
let mut inbox_rx = AssertUnwindSafe(inbox_rx);
104+
let handle = thread::spawn(move || {
105+
let res = catch_unwind(move || {
106+
let rt = actix_web::rt::System::new();
107+
rt.block_on(async {
108+
let mut scheduler = AsyncScheduler::new();
109+
scheduler
110+
.every((CONFIG.parseable.upload_interval as u32).seconds())
111+
.run(|| async {
112+
if let Err(e) = S3::new().s3_sync().await {
113+
warn!("failed to sync local data with object store. {:?}", e);
114+
}
115+
});
116+
117+
loop {
118+
scheduler.run_pending().await;
119+
match AssertUnwindSafe(|| inbox_rx.try_recv())() {
120+
Ok(_) => break,
121+
Err(TryRecvError::Empty) => continue,
122+
Err(TryRecvError::Closed) => {
123+
// should be unreachable but breaking anyways
124+
break;
125+
}
126+
}
127+
}
128+
})
81129
});
82-
scheduler
83-
.every((CONFIG.parseable.upload_interval as u32).seconds())
84-
.run(|| async {
85-
if let Err(e) = S3::new().s3_sync().await {
86-
warn!("failed to sync local data with object store. {:?}", e);
130+
131+
if res.is_err() {
132+
outbox_tx.send(()).unwrap();
133+
}
134+
});
135+
136+
(handle, outbox_rx, inbox_tx)
137+
}
138+
139+
fn run_local_sync() -> (JoinHandle<()>, oneshot::Receiver<()>, oneshot::Sender<()>) {
140+
let (outbox_tx, outbox_rx) = oneshot::channel::<()>();
141+
let (inbox_tx, inbox_rx) = oneshot::channel::<()>();
142+
let mut inbox_rx = AssertUnwindSafe(inbox_rx);
143+
let handle = thread::spawn(move || {
144+
let res = catch_unwind(move || {
145+
let mut scheduler = Scheduler::new();
146+
scheduler
147+
.every((storage::LOCAL_SYNC_INTERVAL as u32).seconds())
148+
.run(move || {
149+
if let Err(e) = S3::new().local_sync() {
150+
warn!("failed to sync local data. {:?}", e);
151+
}
152+
});
153+
154+
loop {
155+
thread::sleep(Duration::from_millis(50));
156+
scheduler.run_pending();
157+
match AssertUnwindSafe(|| inbox_rx.try_recv())() {
158+
Ok(_) => break,
159+
Err(TryRecvError::Empty) => continue,
160+
Err(TryRecvError::Closed) => {
161+
// should be unreachable but breaking anyways
162+
break;
163+
}
164+
}
87165
}
88166
});
89167

90-
loop {
91-
scheduler.run_pending().await;
92-
}
168+
if res.is_err() {
169+
outbox_tx.send(()).unwrap();
170+
}
171+
});
172+
173+
(handle, outbox_rx, inbox_tx)
93174
}
94175

95176
async fn validator(

server/src/storage.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ pub trait ObjectStorage: Sync + 'static {
6363
query: &Query,
6464
results: &mut Vec<RecordBatch>,
6565
) -> Result<(), ObjectStorageError>;
66-
async fn local_sync(&self) -> io::Result<()> {
66+
fn local_sync(&self) -> io::Result<()> {
6767
// If the local data path doesn't exist yet, return early.
6868
// This method will be called again after next ticker interval
6969
if !Path::new(&CONFIG.parseable.local_disk_path).exists() {

0 commit comments

Comments
 (0)