
Commit f9fb6c5

Move to datafusion 26 and arrow 40 (#440)
Part of #438
Parent: 553c8c0

14 files changed: 242 additions & 218 deletions

Cargo.lock

Lines changed: 84 additions & 86 deletions
(Generated file; diff not rendered.)

server/Cargo.toml

Lines changed: 15 additions & 10 deletions
@@ -15,11 +15,11 @@ actix-cors = "0.6"
 actix-web-prometheus = { version = "0.1" }
 prometheus = { version = "0.13", features = ["process"] }
 anyhow = { version = "1.0", features = ["backtrace"] }
-arrow-schema = { version = "36.0.0", features = ["serde"] }
-arrow-array = { version = "36.0.0" }
-arrow-json = "36.0.0"
-arrow-ipc = "36.0.0"
-arrow-select = "36.0.0"
+arrow-schema = { version = "40.0.0", features = ["serde"] }
+arrow-array = { version = "40.0.0" }
+arrow-json = "40.0.0"
+arrow-ipc = "40.0.0"
+arrow-select = "40.0.0"
 async-trait = "0.1"
 base64 = "0.21"
 bytes = "1.4"
@@ -35,7 +35,7 @@ clap = { version = "4.1", default-features = false, features = [
     "error-context",
 ] }
 crossterm = "0.26"
-datafusion = "22.0.0"
+datafusion = "26.0.0"
 object_store = { version = "0.5.6", features = ["aws", "aws_profile"] }
 derive_more = "0.99"
 env_logger = "0.10"
@@ -49,7 +49,12 @@ sysinfo = "0.28.4"
 hostname = "0.3"
 rand = "0.8"
 relative-path = { version = "1.7", features = ["serde"] }
-reqwest = { version = "0.11", default_features=false, features=["rustls", "json", "hyper-rustls", "tokio-rustls"]}
+reqwest = { version = "0.11", default_features = false, features = [
+    "rustls",
+    "json",
+    "hyper-rustls",
+    "tokio-rustls",
+] }
 rustls = "0.20"
 rustls-pemfile = "1.0"
 semver = "1.0"
@@ -70,10 +75,10 @@ ulid = { version = "1.0", features = ["serde"] }
 hex = "0.4"
 itertools = "0.10"
 xxhash-rust = { version = "0.8", features = ["xxh3"] }
-xz2 = { version = "*", features=["static"] }
-bzip2 = { version = "*", features=["static"] }
+xz2 = { version = "*", features = ["static"] }
+bzip2 = { version = "*", features = ["static"] }
 once_cell = "1.17.1"
-parquet = "36.0.0"
+parquet = "40.0.0"
 pyroscope = { version = "0.5.3", optional = true }
 pyroscope_pprofrs = { version = "0.2", optional = true }
 uptime_lib = "0.2.2"

server/src/alerts/mod.rs

Lines changed: 1 addition & 1 deletion
@@ -136,7 +136,7 @@ impl Message {
     // checks if message (with a column name) is valid (i.e. the column name is present in the schema)
     pub fn valid(&self, schema: &Schema, column: Option<&str>) -> bool {
         if let Some(col) = column {
-            return get_field(schema, col).is_some();
+            return get_field(&schema.fields, col).is_some();
         }
         true
    }
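
In arrow 40, Schema::fields is a Fields collection that deref-coerces to &[Arc<Field>], which is why the call site now passes &schema.fields instead of the whole schema. A minimal sketch of a compatible lookup helper, assuming the shape of crate::utils::arrow::get_field (its real body is not shown in this diff):

use std::sync::Arc;
use arrow_schema::Field;

// Sketch only: name-based lookup over the new &[Arc<Field>] shape.
// &schema.fields deref-coerces to this slice type at the call site above.
fn get_field<'a>(fields: &'a [Arc<Field>], name: &str) -> Option<&'a Arc<Field>> {
    fields.iter().find(|field| field.name() == name)
}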

server/src/event.rs

Lines changed: 4 additions & 4 deletions
@@ -21,7 +21,7 @@ pub mod format;
 mod writer;
 
 use arrow_array::RecordBatch;
-use arrow_schema::{Field, Schema};
+use arrow_schema::{Field, Fields, Schema};
 use itertools::Itertools;
 
 use std::sync::Arc;
@@ -85,7 +85,7 @@ impl Event {
     }
 }
 
-pub fn get_schema_key(fields: &[Field]) -> String {
+pub fn get_schema_key(fields: &[Arc<Field>]) -> String {
     // Fields must be sorted
     let mut hasher = xxhash_rust::xxh3::Xxh3::new();
     for field in fields.iter().sorted_by_key(|v| v.name()) {
@@ -102,10 +102,10 @@ pub fn commit_schema(stream_name: &str, schema: Arc<Schema>) -> Result<(), Event
         .get_mut(stream_name)
         .expect("map has entry for this stream name")
         .schema;
-    let current_schema = Schema::new(map.values().cloned().collect());
+    let current_schema = Schema::new(map.values().cloned().collect::<Fields>());
     let schema = Schema::try_merge(vec![current_schema, schema.as_ref().clone()])?;
     map.clear();
-    map.extend(schema.fields.into_iter().map(|f| (f.name().clone(), f)));
+    map.extend(schema.fields.iter().map(|f| (f.name().clone(), f.clone())));
     Ok(())
 }
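
The commit_schema change reflects two arrow 40 shifts: Schema::new now accepts anything convertible into Fields, and schema.fields yields shared Arc<Field> handles, so the map is refilled by cloning Arcs rather than moving owned Fields. A self-contained sketch of that merge-and-refill pattern (function name and map shape assumed for illustration):

use std::collections::HashMap;
use std::sync::Arc;
use arrow_schema::{ArrowError, Field, Fields, Schema};

fn merge_into_map(
    map: &mut HashMap<String, Arc<Field>>,
    incoming: &Schema,
) -> Result<(), ArrowError> {
    // Rebuild the current schema; Fields collects directly from Arc<Field> values
    let current = Schema::new(map.values().cloned().collect::<Fields>());
    let merged = Schema::try_merge(vec![current, incoming.clone()])?;
    map.clear();
    // merged.fields iterates &Arc<Field>, so clone() is a cheap refcount bump
    map.extend(merged.fields.iter().map(|f| (f.name().clone(), f.clone())));
    Ok(())
}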

server/src/event/format.rs

Lines changed: 18 additions & 16 deletions
@@ -32,19 +32,21 @@ pub mod json;
 
 type Tags = String;
 type Metadata = String;
+type EventSchema = Vec<Arc<Field>>;
 
 // Global Trait for event format
 // This trait is implemented by all the event formats
 pub trait EventFormat: Sized {
     type Data;
+
     fn to_data(
         self,
-        schema: &HashMap<String, Field>,
-    ) -> Result<(Self::Data, Schema, bool, Tags, Metadata), AnyError>;
+        schema: HashMap<String, Arc<Field>>,
+    ) -> Result<(Self::Data, EventSchema, bool, Tags, Metadata), AnyError>;
     fn decode(data: Self::Data, schema: Arc<Schema>) -> Result<RecordBatch, AnyError>;
     fn into_recordbatch(
         self,
-        schema: &HashMap<String, Field>,
+        schema: HashMap<String, Arc<Field>>,
     ) -> Result<(RecordBatch, bool), AnyError> {
         let (data, mut schema, is_first, tags, metadata) = self.to_data(schema)?;
 
@@ -67,36 +69,36 @@ pub trait EventFormat: Sized {
         };
 
         // add the p_timestamp field to the event schema to the 0th index
-        schema.fields.insert(
+        schema.insert(
             0,
-            Field::new(
+            Arc::new(Field::new(
                 DEFAULT_TIMESTAMP_KEY,
                 DataType::Timestamp(TimeUnit::Millisecond, None),
                 true,
-            ),
+            )),
         );
 
         // p_tags and p_metadata are added to the end of the schema
-        let tags_index = schema.fields.len();
+        let tags_index = schema.len();
         let metadata_index = tags_index + 1;
-        schema
-            .fields
-            .push(Field::new(DEFAULT_TAGS_KEY, DataType::Utf8, true));
-        schema
-            .fields
-            .push(Field::new(DEFAULT_METADATA_KEY, DataType::Utf8, true));
+        schema.push(Arc::new(Field::new(DEFAULT_TAGS_KEY, DataType::Utf8, true)));
+        schema.push(Arc::new(Field::new(
+            DEFAULT_METADATA_KEY,
+            DataType::Utf8,
+            true,
+        )));
 
         // prepare the record batch and new fields to be added
-        let schema_ref = Arc::new(schema);
-        let rb = Self::decode(data, Arc::clone(&schema_ref))?;
+        let schema = Arc::new(Schema::new(schema));
+        let rb = Self::decode(data, schema.clone())?;
         let tags_arr = StringArray::from_iter_values(std::iter::repeat(&tags).take(rb.num_rows()));
         let metadata_arr =
             StringArray::from_iter_values(std::iter::repeat(&metadata).take(rb.num_rows()));
         let timestamp_array = get_timestamp_array(rb.num_rows());
 
         // modify the record batch to add fields to respective indexes
         let rb = utils::arrow::replace_columns(
-            Arc::clone(&schema_ref),
+            Arc::clone(&schema),
             rb,
             &[0, tags_index, metadata_index],
             &[
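
With EventSchema now a plain Vec<Arc<Field>>, into_recordbatch mutates a vector and only wraps it in a Schema at the end; arrow 40's Schema::new accepts any impl Into<Fields>, including Vec<Arc<Field>>. A standalone sketch of the field-injection step, assuming the p_* key names match the crate's constants:

use std::sync::Arc;
use arrow_schema::{DataType, Field, Schema, TimeUnit};

// Assumed values for illustration; the real constants live elsewhere in the crate.
const DEFAULT_TIMESTAMP_KEY: &str = "p_timestamp";
const DEFAULT_TAGS_KEY: &str = "p_tags";
const DEFAULT_METADATA_KEY: &str = "p_metadata";

fn finalize_schema(mut fields: Vec<Arc<Field>>) -> Arc<Schema> {
    // p_timestamp is injected at index 0
    fields.insert(
        0,
        Arc::new(Field::new(
            DEFAULT_TIMESTAMP_KEY,
            DataType::Timestamp(TimeUnit::Millisecond, None),
            true,
        )),
    );
    // p_tags and p_metadata go at the end
    fields.push(Arc::new(Field::new(DEFAULT_TAGS_KEY, DataType::Utf8, true)));
    fields.push(Arc::new(Field::new(DEFAULT_METADATA_KEY, DataType::Utf8, true)));
    // Vec<Arc<Field>> converts into Fields, so Schema::new takes it directly
    Arc::new(Schema::new(fields))
}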

server/src/event/format/json.rs

Lines changed: 22 additions & 20 deletions
@@ -21,19 +21,19 @@
 
 use anyhow::anyhow;
 use arrow_array::RecordBatch;
-use arrow_json::reader::{infer_json_schema_from_iterator, Decoder, DecoderOptions};
-use arrow_schema::{DataType, Field, Schema};
+use arrow_json::reader::{infer_json_schema_from_iterator, ReaderBuilder};
+use arrow_schema::{DataType, Field, Fields, Schema};
 use datafusion::arrow::util::bit_util::round_upto_multiple_of_64;
 use serde_json::Value;
 use std::{collections::HashMap, sync::Arc};
 
-use super::EventFormat;
+use super::{EventFormat, Metadata, Tags};
 use crate::utils::{arrow::get_field, json::flatten_json_body};
 
 pub struct Event {
     pub data: Value,
-    pub tags: String,
-    pub metadata: String,
+    pub tags: Tags,
+    pub metadata: Metadata,
 }
 
 impl EventFormat for Event {
@@ -43,10 +43,9 @@ impl EventFormat for Event {
     // also extract the arrow schema, tags and metadata from the incoming json
     fn to_data(
         self,
-        schema: &HashMap<String, Field>,
-    ) -> Result<(Self::Data, Schema, bool, String, String), anyhow::Error> {
+        schema: HashMap<String, Arc<Field>>,
+    ) -> Result<(Self::Data, Vec<Arc<Field>>, bool, Tags, Metadata), anyhow::Error> {
         let data = flatten_json_body(self.data)?;
-
         let stream_schema = schema;
 
         // incoming event may be a single json or a json array
@@ -63,18 +62,18 @@ impl EventFormat for Event {
             collect_keys(value_arr.iter()).expect("fields can be collected from array of objects");
 
         let mut is_first = false;
-        let schema = match derive_arrow_schema(stream_schema, fields) {
+        let schema = match derive_arrow_schema(&stream_schema, fields) {
            Ok(schema) => schema,
            Err(_) => match infer_json_schema_from_iterator(value_arr.iter().map(Ok)) {
                Ok(infer_schema) => {
                    if let Err(err) = Schema::try_merge(vec![
-                        Schema::new(stream_schema.values().cloned().collect()),
+                        Schema::new(stream_schema.values().cloned().collect::<Fields>()),
                        infer_schema.clone(),
                    ]) {
                        return Err(anyhow!("Could not merge schema of this event with that of the existing stream. {:?}", err));
                    }
                    is_first = true;
-                    infer_schema
+                    infer_schema.fields.iter().cloned().collect()
                }
                Err(err) => {
                    return Err(anyhow!(
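
When the stored stream schema cannot cover the incoming keys, the code falls back to arrow's schema inference and converts the inferred Schema into the new Vec<Arc<Field>> shape by cloning its FieldRefs. A minimal illustration of that fallback, with sample JSON invented for the example:

use std::sync::Arc;
use arrow_json::reader::infer_json_schema_from_iterator;
use arrow_schema::{ArrowError, Field};
use serde_json::json;

fn main() -> Result<(), ArrowError> {
    let values = vec![
        json!({"level": "info", "status": 200}),
        json!({"level": "warn", "status": 429}),
    ];
    // The iterator yields Result<&Value, ArrowError>, matching the call in the diff
    let inferred = infer_json_schema_from_iterator(values.iter().map(Ok))?;
    // Clone the inferred schema's FieldRefs into the event-schema vector
    let event_schema: Vec<Arc<Field>> = inferred.fields.iter().cloned().collect();
    assert_eq!(event_schema.len(), 2);
    Ok(())
}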
@@ -100,13 +99,13 @@ impl EventFormat for Event {
     // Convert the Data type (defined above) to arrow record batch
     fn decode(data: Self::Data, schema: Arc<Schema>) -> Result<RecordBatch, anyhow::Error> {
         let array_capacity = round_upto_multiple_of_64(data.len());
-        let value_iter: &mut (dyn Iterator<Item = Value>) = &mut data.into_iter();
+        let mut reader = ReaderBuilder::new(schema)
+            .with_batch_size(array_capacity)
+            .with_coerce_primitive(false)
+            .build_decoder()?;
 
-        let reader = Decoder::new(
-            schema,
-            DecoderOptions::new().with_batch_size(array_capacity),
-        );
-        match reader.next_batch(&mut value_iter.map(Ok)) {
+        reader.serialize(&data)?;
+        match reader.flush() {
             Ok(Some(recordbatch)) => Ok(recordbatch),
             Err(err) => Err(anyhow!("Failed to create recordbatch due to {:?}", err)),
             Ok(None) => unreachable!("all records are added to one rb"),
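
arrow-json 40 removed the Decoder::new / DecoderOptions pair used before; a Decoder is now obtained from ReaderBuilder::build_decoder, fed any serde::Serialize rows via serialize, and drained with flush. A self-contained sketch of the new round trip, with schema and rows invented for the example:

use std::sync::Arc;
use arrow_json::reader::ReaderBuilder;
use arrow_schema::{ArrowError, DataType, Field, Schema};
use serde_json::json;

fn main() -> Result<(), ArrowError> {
    let schema = Arc::new(Schema::new(vec![
        Field::new("level", DataType::Utf8, true),
        Field::new("status", DataType::Int64, true),
    ]));
    let rows = vec![
        json!({"level": "info", "status": 200}),
        json!({"level": "error", "status": 500}),
    ];
    // build_decoder replaces Decoder::new + DecoderOptions
    let mut decoder = ReaderBuilder::new(schema)
        .with_batch_size(64)
        .with_coerce_primitive(false)
        .build_decoder()?;
    // serialize buffers the rows; flush drains them into a single RecordBatch
    decoder.serialize(&rows)?;
    let batch = decoder.flush()?.expect("rows were buffered");
    assert_eq!(batch.num_rows(), 2);
    Ok(())
}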
@@ -116,14 +115,17 @@ impl EventFormat for Event {
 
 // Returns arrow schema with the fields that are present in the request body
 // This schema is an input to convert the request body to arrow record batch
-fn derive_arrow_schema(schema: &HashMap<String, Field>, fields: Vec<&str>) -> Result<Schema, ()> {
+fn derive_arrow_schema(
+    schema: &HashMap<String, Arc<Field>>,
+    fields: Vec<&str>,
+) -> Result<Vec<Arc<Field>>, ()> {
     let mut res = Vec::with_capacity(fields.len());
     let fields = fields.into_iter().map(|field_name| schema.get(field_name));
     for field in fields {
         let Some(field) = field else { return Err(()) };
         res.push(field.clone())
     }
-    Ok(Schema::new(res))
+    Ok(res)
 }
 
 fn collect_keys<'a>(values: impl Iterator<Item = &'a Value>) -> Result<Vec<&'a str>, ()> {
@@ -145,7 +147,7 @@ fn collect_keys<'a>(values: impl Iterator<Item = &'a Value>) -> Result<Vec<&'a s
     Ok(keys)
 }
 
-fn fields_mismatch(schema: &Schema, body: &Value) -> bool {
+fn fields_mismatch(schema: &[Arc<Field>], body: &Value) -> bool {
     for (name, val) in body.as_object().expect("body is of object variant") {
         if val.is_null() {
             continue;
