  *
  */

-use arrow_schema::SchemaRef;
-use async_trait::async_trait;
+mod table_provider;
+
 use chrono::TimeZone;
 use chrono::{DateTime, Utc};
 use datafusion::arrow::datatypes::Schema;
-use datafusion::arrow::ipc::reader::StreamReader;
 use datafusion::arrow::record_batch::RecordBatch;
-use datafusion::datasource::file_format::parquet::ParquetFormat;
-use datafusion::datasource::listing::ListingOptions;
-use datafusion::datasource::listing::ListingTable;
-use datafusion::datasource::listing::ListingTableConfig;
-use datafusion::datasource::listing::ListingTableUrl;
-use datafusion::datasource::{MemTable, TableProvider};
-use datafusion::error::DataFusionError;
-use datafusion::execution::context::SessionState;
-use datafusion::logical_expr::TableType;
-use datafusion::physical_plan::union::UnionExec;
-use datafusion::physical_plan::ExecutionPlan;
+use datafusion::datasource::TableProvider;
 use datafusion::prelude::*;
 use serde_json::Value;
-use std::any::Any;
-use std::fs::File;
 use std::path::Path;
 use std::path::PathBuf;
 use std::sync::Arc;
@@ -50,6 +37,7 @@ use crate::utils::TimePeriod;
 use crate::validator;

 use self::error::{ExecuteError, ParseError};
+use table_provider::QueryTableProvider;

 type Key = &'static str;
 fn get_value(value: &Value, key: Key) -> Result<&str, Key> {
@@ -110,41 +98,24 @@ impl Query {

         let parquet_files: Vec<PathBuf> = possible_parquet_files.chain(parquet_files).collect();

-        let mut results = vec![];
-
-        if !(arrow_files.is_empty() && parquet_files.is_empty()) {
-            self.execute_on_cache(
-                arrow_files,
-                parquet_files,
-                self.schema.clone(),
-                &mut results,
-            )
-            .await?;
-        }
-
-        storage.query(self, &mut results).await?;
-        Ok(results)
-    }
-
-    async fn execute_on_cache(
-        &self,
-        arrow_files: Vec<PathBuf>,
-        parquet_files: Vec<PathBuf>,
-        schema: Arc<Schema>,
-        results: &mut Vec<RecordBatch>,
-    ) -> Result<(), ExecuteError> {
-        let ctx = SessionContext::new();
-        let table = Arc::new(QueryTableProvider::new(arrow_files, parquet_files, schema));
+        let ctx =
+            SessionContext::with_config_rt(SessionConfig::default(), storage.query_runtime_env());
+        let table = Arc::new(QueryTableProvider::new(
+            arrow_files,
+            parquet_files,
+            storage.query_table(self)?,
+            Arc::clone(&self.schema),
+        ));
         ctx.register_table(
             &*self.stream_name,
             Arc::clone(&table) as Arc<dyn TableProvider>,
         )
         .map_err(ObjectStorageError::DataFusionError)?;
         // execute the query and collect results
         let df = ctx.sql(self.query.as_str()).await?;
-        results.extend(df.collect().await?);
+        let results = df.collect().await?;
         table.remove_preserve();
-        Ok(())
+        Ok(results)
     }
 }

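Side note on the new context setup above: `SessionContext::with_config_rt` ties each query to a `RuntimeEnv` supplied by the storage layer, so object store access is configured once rather than per query. `query_runtime_env()` and `query_table()` are storage-backend methods this diff does not show; the sketch below only illustrates the DataFusion side of the pattern, with the backend-specific object store registration left as a placeholder comment.

use std::sync::Arc;

use datafusion::execution::runtime_env::{RuntimeConfig, RuntimeEnv};
use datafusion::prelude::{SessionConfig, SessionContext};

// Build a RuntimeEnv once and hand it to each query's SessionContext.
// A real backend would also register its object store (e.g. S3) on the
// RuntimeEnv here so that listing-table scans can resolve remote paths.
fn query_context() -> datafusion::error::Result<SessionContext> {
    let runtime = Arc::new(RuntimeEnv::new(RuntimeConfig::new())?);
    Ok(SessionContext::with_config_rt(SessionConfig::default(), runtime))
}
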
@@ -166,127 +137,6 @@ fn time_from_path(path: &Path) -> DateTime<Utc> {
         .expect("valid prefix is parsed")
 }

-#[derive(Debug)]
-struct QueryTableProvider {
-    arrow_files: Vec<PathBuf>,
-    parquet_files: Vec<PathBuf>,
-    schema: Arc<Schema>,
-}
-
-impl QueryTableProvider {
-    fn new(arrow_files: Vec<PathBuf>, parquet_files: Vec<PathBuf>, schema: Arc<Schema>) -> Self {
-        // By the time this query executes the arrow files could be converted to parquet files
-        // we want to preserve these files as well in case
-
-        let mut parquet_cached = crate::storage::CACHED_FILES.lock().expect("no poisoning");
-        for file in &parquet_files {
-            parquet_cached.upsert(file)
-        }
-
-        Self {
-            arrow_files,
-            parquet_files,
-            schema,
-        }
-    }
-
-    pub fn remove_preserve(&self) {
-        let mut parquet_cached = crate::storage::CACHED_FILES.lock().expect("no poisoning");
-        for file in &self.parquet_files {
-            parquet_cached.remove(file)
-        }
-    }
-
-    pub async fn create_physical_plan(
-        &self,
-        ctx: &SessionState,
-        projection: &Option<Vec<usize>>,
-        filters: &[Expr],
-        limit: Option<usize>,
-    ) -> Result<Arc<dyn ExecutionPlan>, DataFusionError> {
-        let mut mem_records: Vec<Vec<RecordBatch>> = Vec::new();
-        let mut parquet_files = self.parquet_files.clone();
-        for file in &self.arrow_files {
-            let Ok(arrow_file) = File::open(file) else { continue; };
-            let reader = StreamReader::try_new(arrow_file, None)?;
-            let records = reader
-                .filter_map(|record| match record {
-                    Ok(record) => Some(record),
-                    Err(e) => {
-                        log::warn!("warning from arrow stream {:?}", e);
-                        None
-                    }
-                })
-                .collect();
-            mem_records.push(records);
-
-            let mut file = file.clone();
-            file.set_extension("parquet");
-
-            parquet_files.retain(|p| p != &file)
-        }
-
-        let memtable = MemTable::try_new(Arc::clone(&self.schema), mem_records)?;
-        let memexec = memtable.scan(ctx, projection, filters, limit).await?;
-
-        if parquet_files.is_empty() {
-            Ok(memexec)
-        } else {
-            let listing_options = ListingOptions {
-                file_extension: ".parquet".to_owned(),
-                format: Arc::new(ParquetFormat::default().with_enable_pruning(true)),
-                table_partition_cols: vec![],
-                collect_stat: true,
-                target_partitions: 1,
-            };
-
-            let paths = parquet_files
-                .clone()
-                .into_iter()
-                .map(|path| {
-                    ListingTableUrl::parse(path.to_str().expect("path should is valid unicode"))
-                        .expect("path is valid for filesystem listing")
-                })
-                .collect();
-
-            let config = ListingTableConfig::new_with_multi_paths(paths)
-                .with_listing_options(listing_options)
-                .with_schema(Arc::clone(&self.schema));
-
-            let listtable = ListingTable::try_new(config).unwrap();
-            let listexec = listtable.scan(ctx, projection, filters, limit).await?;
-
-            Ok(Arc::new(UnionExec::new(vec![memexec, listexec])))
-        }
-    }
-}
-
-#[async_trait]
-impl TableProvider for QueryTableProvider {
-    fn as_any(&self) -> &dyn Any {
-        self
-    }
-
-    fn schema(&self) -> SchemaRef {
-        Arc::clone(&self.schema)
-    }
-
-    fn table_type(&self) -> TableType {
-        TableType::Base
-    }
-
-    async fn scan(
-        &self,
-        ctx: &SessionState,
-        projection: &Option<Vec<usize>>,
-        filters: &[Expr],
-        limit: Option<usize>,
-    ) -> datafusion::error::Result<Arc<dyn ExecutionPlan>> {
-        self.create_physical_plan(ctx, projection, filters, limit)
-            .await
-    }
-}
-
 pub mod error {
     use datafusion::error::DataFusionError;

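The relocated provider in src/query/table_provider.rs is not shown on this page. Inferring from the deleted code above and the new four-argument constructor call, its skeleton presumably resembles the sketch below; the type of the third `storage` argument (whatever `storage.query_table(self)?` yields) is a guess, as are the placeholder comments standing in for the cache-pinning and plan-building bodies.

// Hypothetical skeleton of the relocated provider; shapes are inferred from
// the call site above, not copied from the actual table_provider.rs.
use std::any::Any;
use std::path::PathBuf;
use std::sync::Arc;

use async_trait::async_trait;
use datafusion::arrow::datatypes::{Schema, SchemaRef};
use datafusion::datasource::TableProvider;
use datafusion::error::Result;
use datafusion::execution::context::SessionState;
use datafusion::logical_expr::TableType;
use datafusion::physical_plan::ExecutionPlan;
use datafusion::prelude::Expr;

pub struct QueryTableProvider {
    arrow_files: Vec<PathBuf>,
    parquet_files: Vec<PathBuf>,
    // Assumption: a storage-backed table over the bucket's parquet files,
    // produced by storage.query_table(self)?.
    storage: Option<Arc<dyn TableProvider>>,
    schema: Arc<Schema>,
}

impl QueryTableProvider {
    pub fn new(
        arrow_files: Vec<PathBuf>,
        parquet_files: Vec<PathBuf>,
        storage: Option<Arc<dyn TableProvider>>,
        schema: Arc<Schema>,
    ) -> Self {
        // As in the deleted new(): pin local parquet files in CACHED_FILES
        // so a concurrent arrow-to-parquet conversion cannot delete them.
        Self { arrow_files, parquet_files, storage, schema }
    }

    pub fn remove_preserve(&self) {
        // As in the deleted remove_preserve(): release the CACHED_FILES pins.
    }
}

#[async_trait]
impl TableProvider for QueryTableProvider {
    fn as_any(&self) -> &dyn Any {
        self
    }

    fn schema(&self) -> SchemaRef {
        Arc::clone(&self.schema)
    }

    fn table_type(&self) -> TableType {
        TableType::Base
    }

    async fn scan(
        &self,
        ctx: &SessionState,
        projection: &Option<Vec<usize>>,
        filters: &[Expr],
        limit: Option<usize>,
    ) -> Result<Arc<dyn ExecutionPlan>> {
        // Union the in-memory arrow batches, local parquet files, and the
        // storage-backed plan, mirroring the deleted create_physical_plan.
        unimplemented!("sketch only")
    }
}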