
Commit 347f7ed

Disable query for stage data (#326)
This PR disables querying of staged data, since the current flow allocates too much memory by loading everything into RAM. This is temporary; we'll re-enable it once the memory issue is fixed. With this change, a query returns only the data persisted in the backend store, not the data in the staging area.
1 parent 4c501a4 commit 347f7ed
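For context on the change: the query path builds a single DataFusion TableProvider, registers it under the stream name, and runs SQL over it; after this commit that provider comes only from the backend object store. Below is a minimal, self-contained sketch of that register-and-query flow, with a made-up table name, schema, and values (this is not Parseable's code, only the DataFusion pattern the query path relies on):

// Sketch only: register an in-memory table with DataFusion and query it.
// The table name "demo_stream", the schema, and the data are invented.
use std::sync::Arc;

use datafusion::arrow::array::Int64Array;
use datafusion::arrow::datatypes::{DataType, Field, Schema};
use datafusion::arrow::record_batch::RecordBatch;
use datafusion::datasource::MemTable;
use datafusion::prelude::*;

#[tokio::main]
async fn main() -> datafusion::error::Result<()> {
    let schema = Arc::new(Schema::new(vec![Field::new("value", DataType::Int64, false)]));
    let batch = RecordBatch::try_new(
        schema.clone(),
        vec![Arc::new(Int64Array::from(vec![1, 2, 3]))],
    )?;

    // Only what is registered with the SessionContext is visible to queries,
    // which is why unregistered staging data no longer shows up in results.
    let table = MemTable::try_new(schema, vec![vec![batch]])?;
    let ctx = SessionContext::new();
    ctx.register_table("demo_stream", Arc::new(table))?;

    let results = ctx.sql("SELECT COUNT(*) FROM demo_stream").await?.collect().await?;
    println!("{results:?}");
    Ok(())
}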

File tree (4 files changed, +13 -36 lines):

server/src/query.rs
server/src/query/table_provider.rs
server/src/storage.rs
server/src/storage/object_storage.rs

server/src/query.rs (7 additions, 32 deletions)

@@ -26,20 +26,16 @@ use datafusion::datasource::TableProvider;
 use datafusion::prelude::*;
 use itertools::Itertools;
 use serde_json::Value;
-use std::collections::hash_map::RandomState;
-use std::collections::HashSet;
-use std::path::{Path, PathBuf};
+use std::path::Path;
 use std::sync::Arc;
 
 use crate::option::CONFIG;
 use crate::storage::ObjectStorageError;
-use crate::storage::StorageDir;
 use crate::storage::{ObjectStorage, OBJECT_STORE_DATA_GRANULARITY};
 use crate::utils::TimePeriod;
 use crate::validator;
 
 use self::error::{ExecuteError, ParseError};
-use table_provider::QueryTableProvider;
 
 type Key = &'static str;
 fn get_value(value: &Value, key: Key) -> Result<&str, Key> {
@@ -89,41 +85,18 @@ impl Query {
         &self,
         storage: Arc<dyn ObjectStorage + Send>,
     ) -> Result<(Vec<RecordBatch>, Vec<String>), ExecuteError> {
-        let dir = StorageDir::new(&self.stream_name);
-        // take a look at local dir and figure out what local cache we could use for this query
-        let staging_arrows = dir
-            .arrow_files_grouped_by_time()
-            .into_iter()
-            .filter(|(path, _)| path_intersects_query(path, self.start, self.end))
-            .sorted_by(|(a, _), (b, _)| Ord::cmp(a, b))
-            .collect_vec();
-
-        let staging_parquet_set: HashSet<&PathBuf, RandomState> =
-            HashSet::from_iter(staging_arrows.iter().map(|(p, _)| p));
-
-        let other_staging_parquet = dir
-            .parquet_files()
-            .into_iter()
-            .filter(|path| path_intersects_query(path, self.start, self.end))
-            .filter(|path| !staging_parquet_set.contains(path))
-            .collect_vec();
-
         let ctx = SessionContext::with_config_rt(
             SessionConfig::default(),
             CONFIG.storage().get_datafusion_runtime(),
         );
 
-        let table = Arc::new(QueryTableProvider::new(
-            staging_arrows,
-            other_staging_parquet,
-            self.get_prefixes(),
-            storage,
-            Arc::new(self.get_schema().clone()),
-        ));
+        let Some(table) = storage.query_table(self.get_prefixes(), Arc::new(self.get_schema().clone()))? else {
+            return Ok((Vec::new(), Vec::new()));
+        };
 
         ctx.register_table(
             &*self.stream_name,
-            Arc::clone(&table) as Arc<dyn TableProvider>,
+            Arc::new(table) as Arc<dyn TableProvider>,
         )
         .map_err(ObjectStorageError::DataFusionError)?;
         // execute the query and collect results
@@ -144,11 +117,13 @@ impl Query {
     }
 }
 
+#[allow(unused)]
 fn path_intersects_query(path: &Path, starttime: DateTime<Utc>, endtime: DateTime<Utc>) -> bool {
     let time = time_from_path(path);
     starttime <= time && time <= endtime
 }
 
+#[allow(unused)]
 fn time_from_path(path: &Path) -> DateTime<Utc> {
     let prefix = path
         .file_name()
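The replacement block in execute() leans on Rust's let-else syntax (stable since 1.65): if storage.query_table(...) yields no table, the function returns empty record batches and field names instead of erroring. A tiny standalone illustration of the pattern, with a hypothetical find_table helper standing in for query_table:

// Illustration of the let-else early return used above; find_table is made up.
fn find_table(name: &str) -> Option<String> {
    (name == "known").then(|| name.to_uppercase())
}

fn run_query(name: &str) -> Result<Vec<String>, String> {
    // If there is nothing to query, return an empty result rather than an error.
    let Some(table) = find_table(name) else {
        return Ok(Vec::new());
    };
    Ok(vec![format!("SELECT * FROM {table}")])
}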

server/src/query/table_provider.rs (3 additions, 0 deletions)

@@ -16,6 +16,8 @@
  *
  */
 
+#![allow(unused)]
+
 use async_trait::async_trait;
 use datafusion::arrow::datatypes::{Schema, SchemaRef};
 use datafusion::arrow::ipc::reader::StreamReader;
@@ -92,6 +94,7 @@ impl QueryTableProvider {
                 parquet_files.push(staging_parquet.clone())
             }
         }
+
         parquet_files.extend(self.other_staging_parquet.clone());
 
         let memtable = MemTable::try_new(Arc::clone(&self.schema), mem_records)?;
server/src/storage.rs (1 addition, 0 deletions)

@@ -248,6 +248,7 @@ impl StorageDir {
         paths
     }
 
+    #[allow(unused)]
     pub fn arrow_files_grouped_by_time(&self) -> HashMap<PathBuf, Vec<PathBuf>> {
         // hashmap <time, vec[paths]>
         let mut grouped_arrow_file: HashMap<PathBuf, Vec<PathBuf>> = HashMap::new();
server/src/storage/object_storage.rs (2 additions, 4 deletions)

@@ -389,10 +389,7 @@ impl MergedRecordReader {
     }
 
     pub fn merged_iter(self, schema: &Schema) -> impl Iterator<Item = RecordBatch> + '_ {
-        let adapted_readers = self
-            .readers
-            .into_iter()
-            .map(move |reader| reader.flatten().map(|batch| adapt_batch(schema, batch)));
+        let adapted_readers = self.readers.into_iter().map(move |reader| reader.flatten());
 
         kmerge_by(adapted_readers, |a: &RecordBatch, b: &RecordBatch| {
             let a: &TimestampMillisecondArray = a
@@ -409,6 +406,7 @@ impl MergedRecordReader {
 
             a.value(0) < b.value(0)
         })
+        .map(|batch| adapt_batch(schema, batch))
     }
 
     pub fn merged_schema(&self) -> Schema {