@@ -30,24 +30,19 @@ use databend_common_pipeline_sources::AsyncSource;
3030use databend_common_pipeline_sources:: AsyncSourcer ;
3131use databend_storages_common_stage:: SingleFilePartition ;
3232use opendal:: Operator ;
33- use orc_rust:: async_arrow_reader:: StripeFactory ;
3433use orc_rust:: ArrowReaderBuilder ;
3534
3635use crate :: chunk_reader_impl:: OrcChunkReader ;
3736use crate :: hashable_schema:: HashableSchema ;
37+ use crate :: processors:: source:: ReadingFile ;
3838use crate :: strip:: StripeInMemory ;
3939use crate :: utils:: map_orc_error;
4040
/// Async source that reads ORC files one stripe at a time.
///
/// Every stripe that is read is handed downstream as an empty `DataBlock`
/// whose meta is a `StripeInMemory` (path, stripe, schema and the row offset
/// at which the stripe starts within its file).
///
/// NOTE(review): this listing is a diff view. Lines marked `-` show the old
/// tuple-typed `reader` field; the line marked `+` is its replacement.
4141pub struct ORCSourceForCopy {
// Table/query context handle. Its use is not visible in this excerpt.
4242 table_ctx : Arc < dyn TableContext > ,
// Progress counter; incremented with the row and byte counts of each stripe read.
4343 scan_progress : Arc < Progress > ,
// opendal operator. Presumably used to open the file being read; the call
// site is outside this excerpt (TODO confirm).
4444 op : Operator ,
// State of the file currently being read. It is moved out with `mem::take`
// (leaving `None`) before each stripe read, and stored back afterwards with
// the advanced stripe factory and the updated running row count.
//
// Old form (removed): an anonymous `(path, stripe factory, schema, size)` tuple.
45- reader : Option < (
46- String ,
47- Box < StripeFactory < OrcChunkReader > > ,
48- HashableSchema ,
49- usize ,
50- ) > ,
// New form (added): the same state as a named struct, which also carries `rows`.
45+ reader : Option < ReadingFile > ,
5146}
5247
5348impl ORCSourceForCopy {
@@ -85,10 +80,16 @@ impl ORCSourceForCopy {
8580 . map_err ( |e| map_orc_error ( e, & path) ) ?;
8681 let reader = builder. build_async ( ) ;
8782 let ( factory, schema) = reader. into_parts ( ) ;
88- let factory = factory. expect ( "factory must has been created" ) ;
83+ let stripe_factory = factory. expect ( "factory must has been created" ) ;
8984 let schema = HashableSchema :: try_create ( schema) ?;
9085
91- self . reader = Some ( ( path, factory, schema, size) ) ;
86+ self . reader = Some ( ReadingFile {
87+ path : path. to_string ( ) ,
88+ stripe_factory,
89+ size,
90+ schema : Some ( schema) ,
91+ rows : 0 ,
92+ } ) ;
9293 Ok ( true )
9394 }
9495}
@@ -105,8 +106,9 @@ impl AsyncSource for ORCSourceForCopy {
105106 return Ok ( None ) ;
106107 }
107108 let start = Instant :: now ( ) ;
108- if let Some ( ( path, factory, schema, size) ) = mem:: take ( & mut self . reader ) {
109- let ( factory, stripe) = factory
109+ if let Some ( file) = mem:: take ( & mut self . reader ) {
110+ let ( factory, stripe) = file
111+ . stripe_factory
110112 . read_next_stripe ( )
111113 . await
112114 . map_err ( |e| ErrorCode :: StorageOther ( e. to_string ( ) ) ) ?;
@@ -118,24 +120,31 @@ impl AsyncSource for ORCSourceForCopy {
118120 }
119121 Some ( stripe) => {
120122 let used = start. elapsed ( ) . as_secs_f32 ( ) ;
123+ let rows = stripe. number_of_rows ( ) ;
124+
121125 let bytes = stripe. stream_map ( ) . inner . values ( ) . map ( |b| b. len ( ) ) . sum ( ) ;
122- let progress_values = ProgressValues {
123- rows : stripe. number_of_rows ( ) ,
124- bytes,
125- } ;
126+ let progress_values = ProgressValues { rows, bytes } ;
126127 Profile :: record_usize_profile ( ProfileStatisticsName :: ScanBytes , bytes) ;
127128 log:: info!(
128- "read new stripe of {} rows and {bytes} bytes from {path }, use {} secs" ,
129+ "read new stripe of {} rows and {bytes} bytes from {}, use {} secs" ,
129130 stripe. number_of_rows( ) ,
131+ file. path,
130132 used
131133 ) ;
132134 self . scan_progress . incr ( & progress_values) ;
133135
134- self . reader = Some ( ( path. clone ( ) , Box :: new ( factory) , schema. clone ( ) , size) ) ;
136+ self . reader = Some ( ReadingFile {
137+ path : file. path . clone ( ) ,
138+ stripe_factory : Box :: new ( factory) ,
139+ size : file. size ,
140+ schema : file. schema . clone ( ) ,
141+ rows : ( rows as u64 ) + file. rows ,
142+ } ) ;
135143 let meta = Box :: new ( StripeInMemory {
136- path,
144+ path : file . path . clone ( ) ,
137145 stripe,
138- schema : Some ( schema) ,
146+ schema : file. schema ,
147+ start_row : file. rows ,
139148 } ) ;
140149 return Ok ( Some ( DataBlock :: empty_with_meta ( meta) ) ) ;
141150 }
0 commit comments