Skip to content

Commit 4d2897e

Browse files
author
Devdutt Shenoi
authored
refactor: ingestion data flow (#1100)
* refactor: kinesis message construction may panic * refactor: replace `BTreeMap` with `serde_json::Map` * refactor: get rid of clone * refactor: use `Value` for JSON data * refactor: `HeaderMap::get` and `let Some else` * refacror: ingest utils don't need http context anymore * refactor: more descriptive error variants * refactor: PUT stream header extraction * refactor: use Path and Json extractor * don't extract where not required * refactor: serde `date_list` * refactor: serde `DefaultPrivilege` * refactor: serde `Dashboard` * refactor: serde `Filter` * refactor: move up `p_timestamp` addition to recordbatch * refactor: refer over clone * fix: don't hog write privileges * refactor: DRY stream writer creation * refactor: serde `StreamType`
1 parent c5964fc commit 4d2897e

29 files changed

+597
-618
lines changed

src/catalog/mod.rs

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,6 @@ use crate::{
3333
query::PartialTimeFilter,
3434
storage::{object_storage::manifest_path, ObjectStorage, ObjectStorageError},
3535
};
36-
use bytes::Bytes;
3736
use chrono::{DateTime, Local, NaiveTime, Utc};
3837
use relative_path::RelativePathBuf;
3938
use std::io::Error as IOError;
@@ -412,13 +411,11 @@ pub async fn get_first_event(
412411
base_path_without_preceding_slash(),
413412
stream_name
414413
);
415-
// Convert dates vector to Bytes object
416-
let dates_bytes = Bytes::from(serde_json::to_vec(&dates).unwrap());
417414
let ingestor_first_event_at =
418415
handlers::http::cluster::send_retention_cleanup_request(
419416
&url,
420417
ingestor.clone(),
421-
dates_bytes,
418+
&dates,
422419
)
423420
.await?;
424421
if !ingestor_first_event_at.is_empty() {

src/event/format/mod.rs

Lines changed: 27 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919

2020
use std::{
2121
collections::{HashMap, HashSet},
22+
fmt::Display,
2223
sync::Arc,
2324
};
2425

@@ -29,7 +30,10 @@ use chrono::DateTime;
2930
use serde::{Deserialize, Serialize};
3031
use serde_json::Value;
3132

32-
use crate::{metadata::SchemaVersion, utils::arrow::get_field};
33+
use crate::{
34+
metadata::SchemaVersion,
35+
utils::arrow::{get_field, get_timestamp_array, replace_columns},
36+
};
3337

3438
use super::DEFAULT_TIMESTAMP_KEY;
3539

@@ -73,6 +77,20 @@ impl From<&str> for LogSource {
7377
}
7478
}
7579

80+
impl Display for LogSource {
81+
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
82+
f.write_str(match self {
83+
LogSource::Kinesis => "kinesis",
84+
LogSource::OtelLogs => "otel-logs",
85+
LogSource::OtelMetrics => "otel-metrics",
86+
LogSource::OtelTraces => "otel-traces",
87+
LogSource::Json => "json",
88+
LogSource::Pmeta => "pmeta",
89+
LogSource::Custom(custom) => custom,
90+
})
91+
}
92+
}
93+
7694
// Global Trait for event format
7795
// This trait is implemented by all the event formats
7896
pub trait EventFormat: Sized {
@@ -126,7 +144,14 @@ pub trait EventFormat: Sized {
126144
}
127145
new_schema =
128146
update_field_type_in_schema(new_schema, None, time_partition, None, schema_version);
129-
let rb = Self::decode(data, new_schema.clone())?;
147+
148+
let mut rb = Self::decode(data, new_schema.clone())?;
149+
rb = replace_columns(
150+
rb.schema(),
151+
&rb,
152+
&[0],
153+
&[Arc::new(get_timestamp_array(rb.num_rows()))],
154+
);
130155

131156
Ok((rb, is_first))
132157
}

src/event/mod.rs

Lines changed: 10 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ use tracing::error;
2828

2929
use self::error::EventError;
3030
pub use self::writer::STREAM_WRITERS;
31-
use crate::{handlers::http::ingest::PostError, metadata, storage::StreamType};
31+
use crate::{metadata, storage::StreamType};
3232
use chrono::NaiveDateTime;
3333
use std::collections::HashMap;
3434

@@ -49,7 +49,7 @@ pub struct Event {
4949

5050
// Events holds the schema related to a each event for a single log stream
5151
impl Event {
52-
pub async fn process(&self) -> Result<(), EventError> {
52+
pub async fn process(self) -> Result<(), EventError> {
5353
let mut key = get_schema_key(&self.rb.schema().fields);
5454
if self.time_partition.is_some() {
5555
let parsed_timestamp_to_min = self.parsed_timestamp.format("%Y%m%dT%H%M").to_string();
@@ -69,10 +69,10 @@ impl Event {
6969
commit_schema(&self.stream_name, self.rb.schema())?;
7070
}
7171

72-
Self::process_event(
72+
STREAM_WRITERS.append_to_local(
7373
&self.stream_name,
7474
&key,
75-
self.rb.clone(),
75+
&self.rb,
7676
self.parsed_timestamp,
7777
&self.custom_partition_values,
7878
&self.stream_type,
@@ -98,44 +98,24 @@ impl Event {
9898
Ok(())
9999
}
100100

101-
pub fn process_unchecked(&self) -> Result<(), PostError> {
101+
pub fn process_unchecked(&self) -> Result<(), EventError> {
102102
let key = get_schema_key(&self.rb.schema().fields);
103103

104-
Self::process_event(
104+
STREAM_WRITERS.append_to_local(
105105
&self.stream_name,
106106
&key,
107-
self.rb.clone(),
107+
&self.rb,
108108
self.parsed_timestamp,
109109
&self.custom_partition_values,
110110
&self.stream_type,
111-
)
112-
.map_err(PostError::Event)
111+
)?;
112+
113+
Ok(())
113114
}
114115

115116
pub fn clear(&self, stream_name: &str) {
116117
STREAM_WRITERS.clear(stream_name);
117118
}
118-
119-
// event process all events after the 1st event. Concatenates record batches
120-
// and puts them in memory store for each event.
121-
fn process_event(
122-
stream_name: &str,
123-
schema_key: &str,
124-
rb: RecordBatch,
125-
parsed_timestamp: NaiveDateTime,
126-
custom_partition_values: &HashMap<String, String>,
127-
stream_type: &StreamType,
128-
) -> Result<(), EventError> {
129-
STREAM_WRITERS.append_to_local(
130-
stream_name,
131-
schema_key,
132-
rb,
133-
parsed_timestamp,
134-
custom_partition_values.clone(),
135-
stream_type,
136-
)?;
137-
Ok(())
138-
}
139119
}
140120

141121
pub fn get_schema_key(fields: &[Arc<Field>]) -> String {

src/event/writer/mem_writer.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ impl<const N: usize> Default for MemWriter<N> {
5050
}
5151

5252
impl<const N: usize> MemWriter<N> {
53-
pub fn push(&mut self, schema_key: &str, rb: RecordBatch) {
53+
pub fn push(&mut self, schema_key: &str, rb: &RecordBatch) {
5454
if !self.schema_map.contains(schema_key) {
5555
self.schema_map.insert(schema_key.to_owned());
5656
self.schema = Schema::try_merge([self.schema.clone(), (*rb.schema()).clone()]).unwrap();
@@ -97,7 +97,7 @@ pub struct MutableBuffer<const N: usize> {
9797
}
9898

9999
impl<const N: usize> MutableBuffer<N> {
100-
fn push(&mut self, rb: RecordBatch) -> Option<Vec<RecordBatch>> {
100+
fn push(&mut self, rb: &RecordBatch) -> Option<Vec<RecordBatch>> {
101101
if self.rows + rb.num_rows() >= N {
102102
let left = N - self.rows;
103103
let right = rb.num_rows() - left;
@@ -121,7 +121,7 @@ impl<const N: usize> MutableBuffer<N> {
121121
Some(inner)
122122
} else {
123123
self.rows += rb.num_rows();
124-
self.inner.push(rb);
124+
self.inner.push(rb.clone());
125125
None
126126
}
127127
}

0 commit comments

Comments (0)