Skip to content

Commit 788f02b

Browse files
authored
Change local cache directory structure (#202)
This PR introduces changes to the local directory structure and removes the tmp directory. All conversion is still carried out in the s3 sync cycle. StorageDir is used as the primary class for determining the data path for a log stream. A RecordBatch file is signified by the extension .arrows. This PR also removes the startup_sync step, as it is no longer required.
1 parent b750684 commit 788f02b

File tree

5 files changed

+121
-267
lines changed

5 files changed

+121
-267
lines changed

server/Cargo.toml

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,6 @@ object_store = { version = "0.5.1", features = ["aws"] }
2525
derive_more = "0.99.17"
2626
env_logger = "0.9.0"
2727
futures = "0.3"
28-
filetime = "0.2.17"
2928
http = "0.2.4"
3029
humantime-serde = "1.1.1"
3130
lazy_static = "1.4.0"
@@ -52,7 +51,6 @@ tokio = { version = "1.13.1", default-features = false, features = [
5251
clokwerk = "0.4.0-rc1"
5352
actix-web-static-files = "4.0"
5453
static-files = "0.2.1"
55-
walkdir = "2"
5654
ureq = { version = "2.5.0", features = ["json"] }
5755
uuid = { version = "1.2.1", features = ["v4", "fast-rng", "serde"] }
5856

server/src/event.rs

Lines changed: 47 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@
1717
*
1818
*/
1919
use actix_web::rt::spawn;
20-
use datafusion::arrow;
2120
use datafusion::arrow::datatypes::Schema;
2221
use datafusion::arrow::error::ArrowError;
2322
use datafusion::arrow::ipc::writer::StreamWriter;
@@ -36,11 +35,10 @@ use std::sync::RwLock;
3635

3736
use crate::metadata;
3837
use crate::metadata::LOCK_EXPECT;
39-
use crate::option::CONFIG;
4038
use crate::s3;
41-
use crate::storage::ObjectStorage;
39+
use crate::storage::{ObjectStorage, StorageDir};
4240

43-
use self::error::EventError;
41+
use self::error::{EventError, StreamWriterError};
4442

4543
type LocalWriter = Mutex<Option<StreamWriter<std::fs::File>>>;
4644
type LocalWriterGuard<'a> = MutexGuard<'a, Option<StreamWriter<std::fs::File>>>;
@@ -91,18 +89,7 @@ impl STREAM_WRITERS {
9189
.write()
9290
.map_err(|_| StreamWriterError::RwPoisoned)?;
9391

94-
let file = OpenOptions::new()
95-
.append(true)
96-
.create_new(true)
97-
.open(data_file_path(&stream))
98-
.map_err(StreamWriterError::Io)?;
99-
100-
let mut stream_writer = StreamWriter::try_new(file, &record.schema())
101-
.expect("File and RecordBatch both are checked");
102-
103-
stream_writer
104-
.write(record)
105-
.map_err(StreamWriterError::Writer)?;
92+
let stream_writer = init_new_stream_writer_file(&stream, record)?;
10693

10794
hashmap_guard.insert(stream, Mutex::new(Some(stream_writer)));
10895

@@ -125,63 +112,51 @@ impl STREAM_WRITERS {
125112
stream: &str,
126113
record: &RecordBatch,
127114
) -> Result<(), StreamWriterError> {
128-
let file = OpenOptions::new()
129-
.append(true)
130-
.create_new(true)
131-
.open(data_file_path(stream))
132-
.map_err(StreamWriterError::Io)?;
133-
134-
let mut stream_writer = StreamWriter::try_new(file, &record.schema())
135-
.expect("File and RecordBatch both are checked");
136-
137-
stream_writer
138-
.write(record)
139-
.map_err(StreamWriterError::Writer)?;
115+
let stream_writer = init_new_stream_writer_file(stream, record)?;
140116

141117
writer_guard.replace(stream_writer); // replace the stream writer behind this mutex
142118

143119
Ok(())
144120
}
145121

146-
// Unset the entry so that
147-
pub fn unset_entry(stream: &str) -> Result<(), StreamWriterError> {
148-
let guard = STREAM_WRITERS
122+
pub fn unset_all() -> Result<(), StreamWriterError> {
123+
let map = STREAM_WRITERS
149124
.read()
150125
.map_err(|_| StreamWriterError::RwPoisoned)?;
151-
let stream_writer = match guard.get(stream) {
152-
Some(writer) => writer,
153-
None => return Ok(()),
154-
};
155-
stream_writer
156-
.lock()
157-
.map_err(|_| StreamWriterError::MutexPoisoned)?
158-
.take();
126+
127+
for writer in map.values() {
128+
if let Some(mut streamwriter) = writer
129+
.lock()
130+
.map_err(|_| StreamWriterError::MutexPoisoned)?
131+
.take()
132+
{
133+
let _ = streamwriter.finish();
134+
}
135+
}
159136

160137
Ok(())
161138
}
162139
}
163140

164-
#[derive(Debug, thiserror::Error)]
165-
pub enum StreamWriterError {
166-
#[error("Arrow writer failed: {0}")]
167-
Writer(arrow::error::ArrowError),
168-
#[error("Io Error when creating new file: {0}")]
169-
Io(std::io::Error),
170-
#[error("RwLock was poisoned")]
171-
RwPoisoned,
172-
#[error("Mutex was poisoned")]
173-
MutexPoisoned,
174-
}
141+
fn init_new_stream_writer_file(
142+
stream_name: &str,
143+
record: &RecordBatch,
144+
) -> Result<StreamWriter<std::fs::File>, StreamWriterError> {
145+
let dir = StorageDir::new(stream_name);
146+
let path = dir.path_by_current_time();
147+
148+
std::fs::create_dir_all(dir.data_path)?;
149+
150+
let file = OpenOptions::new().create(true).append(true).open(path)?;
151+
152+
let mut stream_writer = StreamWriter::try_new(file, &record.schema())
153+
.expect("File and RecordBatch both are checked");
154+
155+
stream_writer
156+
.write(record)
157+
.map_err(StreamWriterError::Writer)?;
175158

176-
fn data_file_path(stream_name: &str) -> String {
177-
format!(
178-
"{}/{}",
179-
CONFIG
180-
.parseable
181-
.local_stream_data_path(stream_name)
182-
.to_string_lossy(),
183-
"data.records"
184-
)
159+
Ok(stream_writer)
185160
}
186161

187162
#[derive(Clone)]
@@ -309,7 +284,7 @@ impl Event {
309284
infer_json_schema(&mut buf_reader, None)
310285
}
311286

312-
fn get_reader(&self, arrow_schema: arrow::datatypes::Schema) -> json::Reader<&[u8]> {
287+
fn get_reader(&self, arrow_schema: Schema) -> json::Reader<&[u8]> {
313288
json::Reader::new(
314289
self.body.as_bytes(),
315290
Arc::new(arrow_schema),
@@ -348,8 +323,6 @@ pub mod error {
348323
use crate::storage::ObjectStorageError;
349324
use datafusion::arrow::error::ArrowError;
350325

351-
use super::StreamWriterError;
352-
353326
#[derive(Debug, thiserror::Error)]
354327
pub enum EventError {
355328
#[error("Missing Record from event body")]
@@ -365,4 +338,16 @@ pub mod error {
365338
#[error("Schema Mismatch: {0}")]
366339
ObjectStorage(#[from] ObjectStorageError),
367340
}
341+
342+
#[derive(Debug, thiserror::Error)]
343+
pub enum StreamWriterError {
344+
#[error("Arrow writer failed: {0}")]
345+
Writer(#[from] ArrowError),
346+
#[error("Io Error when creating new file: {0}")]
347+
Io(#[from] std::io::Error),
348+
#[error("RwLock was poisoned")]
349+
RwPoisoned,
350+
#[error("Mutex was poisoned")]
351+
MutexPoisoned,
352+
}
368353
}

server/src/handlers/logstream.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ pub async fn delete(req: HttpRequest) -> HttpResponse {
7070
)
7171
}
7272

73-
let stream_dir = StorageDir::new(stream_name.clone());
73+
let stream_dir = StorageDir::new(&stream_name);
7474
if fs::remove_dir_all(&stream_dir.data_path).is_err() {
7575
log::warn!(
7676
"failed to delete local data for stream {}. Clean {} manually",

server/src/main.rs

Lines changed: 3 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -22,9 +22,7 @@ use actix_web::{middleware, web, App, HttpServer};
2222
use actix_web_httpauth::extractors::basic::BasicAuth;
2323
use actix_web_httpauth::middleware::HttpAuthentication;
2424
use actix_web_static_files::ResourceFiles;
25-
use chrono::{DateTime, NaiveDateTime, Timelike, Utc};
2625
use clokwerk::{AsyncScheduler, Scheduler, TimeUnits};
27-
use filetime::FileTime;
2826
use log::warn;
2927
use rustls::{Certificate, PrivateKey, ServerConfig};
3028
use rustls_pemfile::{certs, pkcs8_private_keys};
@@ -33,7 +31,7 @@ use thread_priority::{ThreadBuilder, ThreadPriority};
3331

3432
include!(concat!(env!("OUT_DIR"), "/generated.rs"));
3533

36-
use std::fs::{self, File};
34+
use std::fs::File;
3735
use std::io::BufReader;
3836
use std::panic::{catch_unwind, AssertUnwindSafe};
3937
use std::thread::{self, JoinHandle};
@@ -57,7 +55,7 @@ mod validator;
5755

5856
use option::CONFIG;
5957
use s3::S3;
60-
use storage::{ObjectStorage, StorageDir};
58+
use storage::ObjectStorage;
6159

6260
// Global configurations
6361
const MAX_EVENT_PAYLOAD_SIZE: usize = 1024000;
@@ -75,10 +73,6 @@ async fn main() -> anyhow::Result<()> {
7573
warn!("could not populate local metadata. {:?}", e);
7674
}
7775

78-
// Move all exiting data.records file to their respective tmp directory
79-
// they will be synced to object store on next s3 sync cycle
80-
startup_sync();
81-
8276
let (localsync_handler, mut localsync_outbox, localsync_inbox) = run_local_sync();
8377
let (mut s3sync_handler, mut s3sync_outbox, mut s3sync_inbox) = s3_sync();
8478

@@ -108,59 +102,6 @@ async fn main() -> anyhow::Result<()> {
108102
}
109103
}
110104

111-
fn startup_sync() {
112-
for stream in metadata::STREAM_INFO.list_streams() {
113-
let dir = StorageDir::new(stream.clone());
114-
115-
dir.create_temp_dir()
116-
.expect("Could not create temporary directory. Please check if Parseable is running with correct permissions.");
117-
118-
// if data.records file is not present then skip this stream
119-
if !dir.local_data_exists() {
120-
continue;
121-
}
122-
123-
// create prefix for this file from its last modified time
124-
let path = dir.data_path.join("data.records");
125-
126-
// metadata.modified gives us system time
127-
// This may not work on all platforms
128-
let metadata = match fs::metadata(&path) {
129-
Ok(meta) => meta,
130-
Err(err) => {
131-
log::warn!(
132-
"Failed to get file metadata for {} due to {:?}. Skipping!",
133-
path.display(),
134-
err
135-
);
136-
continue;
137-
}
138-
};
139-
140-
let last_modified = FileTime::from_last_modification_time(&metadata);
141-
let last_modified = NaiveDateTime::from_timestamp_opt(last_modified.unix_seconds(), 0);
142-
let last_modified: DateTime<Utc> = DateTime::from_utc(last_modified.unwrap(), Utc);
143-
144-
let uri = utils::date_to_prefix(last_modified.date_naive())
145-
+ &utils::hour_to_prefix(last_modified.hour())
146-
+ &utils::minute_to_prefix(
147-
last_modified.minute(),
148-
storage::OBJECT_STORE_DATA_GRANULARITY,
149-
)
150-
.unwrap();
151-
let local_uri = str::replace(&uri, "/", ".");
152-
let hostname = utils::hostname_unchecked();
153-
let parquet_file_local = format!("{}{}.data.parquet", local_uri, hostname);
154-
if let Err(err) = dir.move_local_to_temp(parquet_file_local) {
155-
panic!(
156-
"Failed to move parquet file at {} to tmp directory due to error {}",
157-
path.display(),
158-
err
159-
)
160-
}
161-
}
162-
}
163-
164105
fn s3_sync() -> (JoinHandle<()>, oneshot::Receiver<()>, oneshot::Sender<()>) {
165106
let (outbox_tx, outbox_rx) = oneshot::channel::<()>();
166107
let (inbox_tx, inbox_rx) = oneshot::channel::<()>();
@@ -218,7 +159,7 @@ fn run_local_sync() -> (JoinHandle<()>, oneshot::Receiver<()>, oneshot::Sender<(
218159
scheduler
219160
.every((storage::LOCAL_SYNC_INTERVAL as u32).seconds())
220161
.run(move || {
221-
if let Err(e) = S3::new().local_sync() {
162+
if let Err(e) = crate::event::STREAM_WRITERS::unset_all() {
222163
warn!("failed to sync local data. {:?}", e);
223164
}
224165
});

0 commit comments

Comments
 (0)