@@ -34,6 +34,7 @@ use common_arrow::parquet::metadata::ColumnChunkMetaData;
3434use common_arrow:: parquet:: metadata:: RowGroupMetaData ;
3535use common_arrow:: parquet:: read:: read_metadata;
3636use common_arrow:: read_columns_async;
37+ use common_base:: runtime:: execute_futures_in_parallel;
3738use common_catalog:: plan:: StageFileInfo ;
3839use common_exception:: ErrorCode ;
3940use common_exception:: Result ;
@@ -63,18 +64,10 @@ use crate::input_formats::SplitInfo;
6364
6465pub struct InputFormatParquet ;
6566
66- fn col_offset ( meta : & ColumnChunkMetaData ) -> i64 {
67- meta. data_page_offset ( )
68- }
69-
70- #[ async_trait:: async_trait]
71- impl InputFormat for InputFormatParquet {
72- async fn get_splits (
73- & self ,
67+ impl InputFormatParquet {
68+ async fn get_split_batch (
7469 file_infos : Vec < StageFileInfo > ,
75- _stage_info : & StageInfo ,
76- op : & Operator ,
77- _settings : & Arc < Settings > ,
70+ op : Operator ,
7871 ) -> Result < Vec < Arc < SplitInfo > > > {
7972 let mut infos = vec ! [ ] ;
8073 let mut schema = None ;
@@ -124,8 +117,56 @@ impl InputFormat for InputFormatParquet {
124117 }
125118 }
126119 }
120+
127121 Ok ( infos)
128122 }
123+ }
124+
125+ fn col_offset ( meta : & ColumnChunkMetaData ) -> i64 {
126+ meta. data_page_offset ( )
127+ }
128+
129+ #[ async_trait:: async_trait]
130+ impl InputFormat for InputFormatParquet {
131+ async fn get_splits (
132+ & self ,
133+ file_infos : Vec < StageFileInfo > ,
134+ _stage_info : & StageInfo ,
135+ op : & Operator ,
136+ _settings : & Arc < Settings > ,
137+ ) -> Result < Vec < Arc < SplitInfo > > > {
138+ let batch_size = 1000 ;
139+
140+ if file_infos. len ( ) <= batch_size {
141+ Self :: get_split_batch ( file_infos, op. clone ( ) ) . await
142+ } else {
143+ let mut chunks = file_infos. chunks ( batch_size) ;
144+
145+ let tasks = std:: iter:: from_fn ( move || {
146+ chunks
147+ . next ( )
148+ . map ( |location| Self :: get_split_batch ( location. to_vec ( ) , op. clone ( ) ) )
149+ } ) ;
150+
151+ // TODO: Get from ctx.
152+ let thread_nums = 16 ;
153+ let permit_nums = 64 ;
154+ let result = execute_futures_in_parallel (
155+ tasks,
156+ thread_nums,
157+ permit_nums,
158+ "get-parquet-splits-worker" . to_owned ( ) ,
159+ )
160+ . await ?
161+ . into_iter ( )
162+ . collect :: < Result < Vec < Vec < _ > > > > ( ) ?
163+ . into_iter ( )
164+ . flatten ( )
165+ . collect ( ) ;
166+
167+ Ok ( result)
168+ }
169+ }
129170
130171 async fn infer_schema ( & self , path : & str , op : & Operator ) -> Result < TableSchemaRef > {
131172 let mut reader = op. reader ( path) . await ?;
0 commit comments