Skip to content

Commit 47b0f8c

Browse files
authored
Merge pull request #3 from Schwarzam/testing-new
redone all code with parquet -- not using polars anymore
2 parents b6ae52b + 78794ad commit 47b0f8c

File tree

13 files changed

+276
-276
lines changed

13 files changed

+276
-276
lines changed

Cargo.toml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,10 @@ name = "lsdb_server"
1717
path = "src/bin.rs"
1818

1919
[dependencies]
20+
futures-util = "0.3.30"
21+
arrow = "52.0.0"
22+
parquet = { version = "52.0.0", features = ["arrow", "async"] }
2023
axum = "0.7.5"
21-
polars = { version = "0.40.0", features = ["lazy", "parquet", "dtype-u8"] }
2224
tokio = { version = "1.37.0", features = ["full"] }
2325
hyper = { version="1.3.1", features = ["full"] }
2426
tower = "0.4.13"

src/loaders/mod.rs

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1 @@
1-
pub mod parquet;
2-
pub mod parsers;
1+
pub mod parquet;

src/loaders/parquet.rs

Lines changed: 0 additions & 81 deletions
This file was deleted.

src/loaders/parquet/helpers.rs

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
use arrow::array::{Float64Array, Float32Array, Int16Array, Int32Array, Int64Array, Int8Array, BooleanArray};
2+
use arrow::record_batch::RecordBatch;
3+
use arrow::array::BooleanBuilder;
4+
use arrow::datatypes::Schema;
5+
use std::sync::Arc;
6+
7+
/// Create a boolean mask based on the filters provided.
8+
///
9+
/// # Arguments
10+
///
11+
/// * `batch` - A reference to a RecordBatch that will be filtered.
12+
/// * `original_schema` - A reference to the original schema of the RecordBatch.
13+
/// * `filters` - A vector of tuples containing the column name, the comparison operator and the value to compare.
14+
///
15+
/// # Returns
16+
///
17+
/// This function returns an Arrow Result with the boolean mask.
18+
pub fn create_boolean_mask(batch: &RecordBatch, original_schema: &Arc<Schema>, filters: Vec<(&str, &str, &str)>) -> arrow::error::Result<Arc<BooleanArray>> {
19+
let num_rows = batch.num_rows();
20+
let mut boolean_builder = BooleanBuilder::new();
21+
22+
// Initialize all rows as true
23+
for _ in 0..num_rows {
24+
boolean_builder.append_value(true);
25+
}
26+
let mut boolean_mask = boolean_builder.finish();
27+
28+
for filter in filters.iter() {
29+
let column = batch.column(original_schema.index_of(filter.0).unwrap());
30+
31+
if column.data_type() == &arrow::datatypes::DataType::Float32 {
32+
let column = column.as_any().downcast_ref::<Float32Array>().unwrap();
33+
apply_filter(&mut boolean_mask, column, filter)?;
34+
} else if column.data_type() == &arrow::datatypes::DataType::Float64 {
35+
let column = column.as_any().downcast_ref::<Float64Array>().unwrap();
36+
apply_filter(&mut boolean_mask, column, filter)?;
37+
} else if column.data_type() == &arrow::datatypes::DataType::Int16 {
38+
let column = column.as_any().downcast_ref::<Int16Array>().unwrap();
39+
apply_filter(&mut boolean_mask, column, filter)?;
40+
} else if column.data_type() == &arrow::datatypes::DataType::Int32 {
41+
let column = column.as_any().downcast_ref::<Int32Array>().unwrap();
42+
apply_filter(&mut boolean_mask, column, filter)?;
43+
} else if column.data_type() == &arrow::datatypes::DataType::Int64 {
44+
let column = column.as_any().downcast_ref::<Int64Array>().unwrap();
45+
apply_filter(&mut boolean_mask, column, filter)?;
46+
} else if column.data_type() == &arrow::datatypes::DataType::Int8 {
47+
let column = column.as_any().downcast_ref::<Int8Array>().unwrap();
48+
apply_filter(&mut boolean_mask, column, filter)?;
49+
} else if column.data_type() == &arrow::datatypes::DataType::Boolean {
50+
let column = column.as_any().downcast_ref::<Int16Array>().unwrap();
51+
apply_filter(&mut boolean_mask, column, filter)?;
52+
} else {
53+
return Err(arrow::error::ArrowError::NotYetImplemented(format!("Data type {:?} not yet implemented", column.data_type())));
54+
}
55+
}
56+
Ok(Arc::new(boolean_mask))
57+
}
58+
59+
/// Apply a filter to a column and update the boolean mask.
60+
///
61+
/// # Arguments
62+
///
63+
/// * `boolean_mask` - A mutable reference to a BooleanArray that will be updated with the filter results.
64+
/// * `column` - A reference to a PrimitiveArray that will be filtered.
65+
/// * `filter` - A tuple containing the column name, the comparison operator and the value to compare.
66+
///
67+
/// # Returns
68+
///
69+
/// This function returns an Arrow Result.
70+
fn apply_filter<T>(boolean_mask: &mut BooleanArray, column: &arrow::array::PrimitiveArray<T>, filter: &(&str, &str, &str)) -> arrow::error::Result<()>
71+
where
72+
T: arrow::datatypes::ArrowPrimitiveType,
73+
T::Native: std::cmp::PartialOrd + std::str::FromStr,
74+
<T::Native as std::str::FromStr>::Err: std::fmt::Debug,
75+
{
76+
let filter_value = filter.2.parse::<T::Native>().unwrap();
77+
let mut new_mask = BooleanBuilder::new();
78+
79+
for (index, value) in column.iter().enumerate() {
80+
let current_mask = boolean_mask.value(index);
81+
let result = match filter.1 {
82+
">" => value.map_or(false, |v| v > filter_value),
83+
"<" => value.map_or(false, |v| v < filter_value),
84+
"=" => value.map_or(false, |v| v == filter_value),
85+
"!=" => value.map_or(false, |v| v != filter_value),
86+
">=" => value.map_or(false, |v| v >= filter_value),
87+
"<=" => value.map_or(false, |v| v <= filter_value),
88+
"==" => value.map_or(false, |v| v == filter_value),
89+
_ => false,
90+
};
91+
new_mask.append_value(current_mask && result);
92+
}
93+
94+
*boolean_mask = new_mask.finish();
95+
Ok(())
96+
}

src/loaders/parquet/mod.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
pub mod parse_params;
2+
pub mod helpers;
3+
pub mod parquet;

src/loaders/parquet/parquet.rs

Lines changed: 125 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,125 @@
1+
2+
use std::collections::HashMap;
3+
use std::error::Error;
4+
use std::sync::Arc;
5+
6+
use arrow::array::{ArrayRef, NullArray};
7+
use arrow::array::new_null_array;
8+
use arrow::record_batch::RecordBatch;
9+
10+
use parquet::arrow::async_reader::ParquetRecordBatchStreamBuilder;
11+
use parquet::arrow::arrow_reader::ArrowReaderMetadata;
12+
use parquet::arrow::arrow_writer::ArrowWriter;
13+
use parquet::file::properties::WriterProperties;
14+
15+
use futures_util::stream::StreamExt;
16+
use tokio::fs::File;
17+
18+
use crate::loaders::parquet::parse_params;
19+
use crate::loaders::parquet::helpers::create_boolean_mask;
20+
21+
/// Process a Parquet file and return the content as a byte stream.
22+
///
23+
/// # Arguments
24+
///
25+
/// * `file_path` - A reference to a string containing the path to the Parquet file.
26+
/// * `params` - A reference to a HashMap of parameters containing 'columns' and 'filters' keys.
27+
///
28+
/// # Returns
29+
///
30+
/// This function returns a byte stream that can be directly used as an HTTP response body.
31+
pub async fn process_and_return_parquet_file(
32+
file_path: &str,
33+
params: &HashMap<String, String>
34+
) -> Result<Vec<u8>, Box<dyn Error>> {
35+
// Open async file containing parquet data
36+
let std_file = std::fs::File::open(file_path)?;
37+
let mut file = File::from_std(std_file);
38+
39+
let meta = ArrowReaderMetadata::load_async(&mut file, Default::default()).await?;
40+
let stream_builder = ParquetRecordBatchStreamBuilder::new_with_metadata(
41+
file.try_clone().await?,
42+
meta.clone()
43+
);
44+
let original_metadata = meta.metadata();
45+
let metadata_keys = original_metadata
46+
.file_metadata()
47+
.key_value_metadata()
48+
.unwrap()
49+
.clone();
50+
51+
let original_schema = stream_builder
52+
.schema()
53+
.clone();
54+
55+
let all_columns = original_schema
56+
.fields()
57+
.iter()
58+
.map(|field| field.name().to_string())
59+
.collect::<Vec<_>>();
60+
61+
// Parse selected columns from params
62+
let columns = parse_params::parse_columns_from_params_to_str(&params)
63+
.unwrap_or(all_columns);
64+
65+
let filters = parse_params::parse_filters(&params);
66+
67+
// Construct the reader stream
68+
let mut stream = stream_builder
69+
.with_batch_size(8192)
70+
.build()?;
71+
72+
// Set writer properties with the original metadata
73+
let writer_properties = WriterProperties::builder()
74+
.set_key_value_metadata(Some(metadata_keys))
75+
.build();
76+
77+
let mut out_buffer = Vec::new();
78+
let mut writer = ArrowWriter::try_new(
79+
&mut out_buffer,
80+
original_schema.clone(),
81+
Some(writer_properties)
82+
)?;
83+
84+
// Collect all batches and write them to the buffer
85+
while let Some(batch) = stream.next().await {
86+
let mut batch = batch?;
87+
88+
//let predicate = arrow::compute::FilterBuilder::new(&batch, &projection)?;
89+
if filters.is_some() {
90+
let filter_mask = &create_boolean_mask(
91+
&batch,
92+
&original_schema,
93+
filters.clone().unwrap()
94+
).unwrap();
95+
batch = arrow::compute::filter_record_batch(
96+
&batch,
97+
&filter_mask
98+
)?;
99+
}
100+
101+
let selected_arrays = original_schema.fields().iter()
102+
.map(|field| {
103+
if let Ok(index) = batch.schema().index_of(field.name()) {
104+
if columns.contains(&field.name().to_string()) || &field.name().to_string() == "_hipscat_index" {
105+
batch.column(index).clone()
106+
} else {
107+
new_null_array(
108+
field.data_type(),
109+
batch.num_rows()
110+
)
111+
}
112+
} else {
113+
Arc::new(NullArray::new(batch.num_rows())) as ArrayRef
114+
}
115+
})
116+
.collect::<Vec<_>>();
117+
118+
let selected_batch = RecordBatch::try_new(original_schema.clone(), selected_arrays)?;
119+
writer.write(&selected_batch)?;
120+
}
121+
122+
writer.finish()?;
123+
let _ = writer.close();
124+
Ok(out_buffer)
125+
}
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
use std::collections::HashMap;
2+
use regex::Regex;
3+
4+
/// Parse the comma-separated `columns` query parameter.
///
/// # Arguments
///
/// * `params` - A reference to a HashMap of parameters containing an
///   optional 'columns' key.
///
/// # Returns
///
/// `Some` vector of the selected column names, or `None` when the
/// parameter is absent.
pub fn parse_columns_from_params_to_str( params: &HashMap<String, String> ) -> Option<Vec<String>> {
    // Absent key maps straight to None; otherwise split on commas and own
    // each piece. (Doc previously claimed "a vector of Polars" — stale text
    // from the removed polars backend.)
    params
        .get("columns")
        .map(|cols| cols.split(',').map(str::to_string).collect())
}
20+
21+
/// # Arguments
22+
///
23+
/// * `params` - A reference to a HashMap of parameters containing 'filters' key.
24+
///
25+
/// # Returns
26+
///
27+
/// A vector of tuples containing the column name, the comparison operator and the value to compare.
28+
pub fn parse_filters(params: &HashMap<String, String>) -> Option<Vec<(&str, &str, &str)>> {
29+
let mut filters = Vec::new();
30+
if let Some(query) = params.get("filters") {
31+
filters = query.split(",").collect::<Vec<_>>();
32+
}
33+
34+
if filters.len() == 0 {
35+
return None
36+
}
37+
38+
let re = Regex::new(r"([a-zA-Z_]+)([<>=]+)([-+]?[0-9]*\.?[0-9]*)").unwrap();
39+
let mut filter_vec = Vec::new();
40+
for filter in filters {
41+
let f_vec = re.captures(filter).unwrap();
42+
filter_vec.push((f_vec.get(1).unwrap().as_str(), f_vec.get(2).unwrap().as_str(), f_vec.get(3).unwrap().as_str()));
43+
}
44+
45+
Some(filter_vec)
46+
}

0 commit comments

Comments
 (0)