Skip to content

Commit 604085e

Browse files
authored
fix(cubesql): Reduce memory usage while converting to DataFrame (#8598)
1 parent e137630 commit 604085e

File tree

4 files changed: +8 −8 lines changed

packages/cubejs-backend-native/src/utils.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ pub fn bind_method<'a>(
2727

2828
pub fn batch_to_rows(batch: RecordBatch) -> Result<(Value, Vec<Value>), CubeError> {
2929
let schema = batch.schema();
30-
let data_frame = dataframe::batch_to_dataframe(&schema, &vec![batch])?;
30+
let data_frame = dataframe::batches_to_dataframe(&schema, vec![batch])?;
3131

3232
let columns = serde_json::to_value(data_frame.get_columns())?;
3333
let rows = data_frame

rust/cubesql/cubesql/src/compile/test/mod.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ use crate::{
1616
},
1717
config::{ConfigObj, ConfigObjImpl},
1818
sql::{
19-
compiler_cache::CompilerCacheImpl, dataframe::batch_to_dataframe, AuthContextRef,
19+
compiler_cache::CompilerCacheImpl, dataframe::batches_to_dataframe, AuthContextRef,
2020
AuthenticateResponse, HttpAuthContext, ServerManager, Session, SessionManager,
2121
SqlAuthService,
2222
},
@@ -839,7 +839,7 @@ impl TestContext {
839839
QueryPlan::DataFusionSelect(flags, plan, ctx) => {
840840
let df = DFDataFrame::new(ctx.state, &plan);
841841
let batches = df.collect().await?;
842-
let frame = batch_to_dataframe(&df.schema().into(), &batches)?;
842+
let frame = batches_to_dataframe(&df.schema().into(), batches)?;
843843

844844
output.push(frame.print());
845845
output_flags = flags;

rust/cubesql/cubesql/src/sql/dataframe.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -422,9 +422,9 @@ pub fn arrow_to_column_type(arrow_type: DataType) -> Result<ColumnType, CubeErro
422422
}
423423
}
424424

425-
pub fn batch_to_dataframe(
425+
pub fn batches_to_dataframe(
426426
schema: &Schema,
427-
batches: &Vec<RecordBatch>,
427+
batches: Vec<RecordBatch>,
428428
) -> Result<DataFrame, CubeError> {
429429
let mut cols = Vec::with_capacity(schema.fields().len());
430430
let mut all_rows = vec![];
@@ -437,7 +437,7 @@ pub fn batch_to_dataframe(
437437
));
438438
}
439439

440-
for batch in batches.iter() {
440+
for batch in batches.into_iter() {
441441
if batch.num_rows() == 0 {
442442
continue;
443443
}

rust/cubesql/cubesql/src/sql/postgres/extended.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
use crate::{
22
compile::QueryPlan,
33
sql::{
4-
dataframe::{batch_to_dataframe, DataFrame, TableValue},
4+
dataframe::{batches_to_dataframe, DataFrame, TableValue},
55
statement::PostgresStatementParamsBinder,
66
temp_tables::TempTable,
77
writer::BatchWriter,
@@ -390,7 +390,7 @@ impl Portal {
390390
}
391391
};
392392

393-
let frame = batch_to_dataframe(batch_for_write.schema().as_ref(), &vec![batch_for_write])?;
393+
let frame = batches_to_dataframe(batch_for_write.schema().as_ref(), vec![batch_for_write])?;
394394

395395
Ok((unused, self.dataframe_to_writer(frame)?))
396396
}

0 commit comments

Comments (0)