@@ -35,22 +35,52 @@ use orc_rust::projection::ProjectionMask;
3535use orc_rust:: ArrowReaderBuilder ;
3636
3737use crate :: chunk_reader_impl:: OrcChunkReader ;
38+ use crate :: hashable_schema:: HashableSchema ;
3839use crate :: strip:: StripeInMemory ;
3940use crate :: utils:: map_orc_error;
4041
42+ pub struct InferredSchema {
43+ arrow_schema : arrow_schema:: SchemaRef ,
44+ schema_from : Option < String > ,
45+ projection : Projection ,
46+ }
47+
48+ impl InferredSchema {
49+ fn check_file_schema ( & self , arrow_schema : arrow_schema:: SchemaRef , path : & str ) -> Result < ( ) > {
50+ if self . arrow_schema . fields != arrow_schema. fields {
51+ return Err ( ErrorCode :: TableSchemaMismatch ( format ! (
52+ "{} get diff schema in file '{}'. Expected schema: {:?}, actual: {:?}" ,
53+ self . schema_from
54+ . as_ref( )
55+ . map_or( String :: new( ) , |schema_from| {
56+ format!( "infer schema from '{}', but " , schema_from)
57+ } ) ,
58+ path,
59+ self . arrow_schema,
60+ arrow_schema
61+ ) ) ) ;
62+ }
63+ Ok ( ( ) )
64+ }
65+ }
66+
67+ pub struct ReadingFile {
68+ path : String ,
69+ stripe_factory : Box < StripeFactory < OrcChunkReader > > ,
70+ size : usize ,
71+ schema : Option < HashableSchema > ,
72+ }
73+
4174pub struct ORCSource {
4275 table_ctx : Arc < dyn TableContext > ,
4376 op_registry : Arc < dyn OperatorRegistry > ,
44- pub ( crate ) reader : Option < ( String , Box < StripeFactory < OrcChunkReader > > , usize ) > ,
77+ pub reader : Option < ReadingFile > ,
4578 scan_progress : Arc < Progress > ,
46-
47- arrow_schema : arrow_schema:: SchemaRef ,
48- schema_from : Option < String > ,
49- projection : Projection ,
79+ inferred_schema : Option < InferredSchema > ,
5080}
5181
5282impl ORCSource {
53- pub fn try_create (
83+ pub fn try_create_with_schema (
5484 output : Arc < OutputPort > ,
5585 table_ctx : Arc < dyn TableContext > ,
5686 op_registry : Arc < dyn OperatorRegistry > ,
@@ -65,27 +95,29 @@ impl ORCSource {
6595 op_registry,
6696 scan_progress,
6797 reader : None ,
68- arrow_schema,
69- schema_from,
70- projection,
98+ inferred_schema : Some ( InferredSchema {
99+ arrow_schema,
100+ schema_from,
101+ projection,
102+ } ) ,
71103 } )
72104 }
73105
74- fn check_file_schema ( & self , arrow_schema : arrow_schema :: SchemaRef , path : & str ) -> Result < ( ) > {
75- if self . arrow_schema . fields != arrow_schema . fields {
76- return Err ( ErrorCode :: TableSchemaMismatch ( format ! (
77- "{}get diff schema in file '{}'. Expected schema: {:?}, actual: {:?}" ,
78- self . schema_from
79- . as_ref ( )
80- . map_or ( String :: new ( ) , |schema_from| {
81- format! ( "infer schema from '{}', but " , schema_from )
82- } ) ,
83- path ,
84- self . arrow_schema ,
85- arrow_schema
86- ) ) ) ;
87- }
88- Ok ( ( ) )
106+ pub fn try_create (
107+ output : Arc < OutputPort > ,
108+ table_ctx : Arc < dyn TableContext > ,
109+ op_registry : Arc < dyn OperatorRegistry > ,
110+ inferred_schema : Option < InferredSchema > ,
111+ ) -> Result < ProcessorPtr > {
112+ let scan_progress = table_ctx . get_scan_progress ( ) ;
113+
114+ AsyncSourcer :: create ( table_ctx . clone ( ) , output , ORCSource {
115+ table_ctx ,
116+ op_registry ,
117+ scan_progress ,
118+ reader : None ,
119+ inferred_schema ,
120+ } )
89121 }
90122
91123 async fn next_part ( & mut self ) -> Result < bool > {
@@ -105,20 +137,32 @@ impl ORCSource {
105137 let builder = ArrowReaderBuilder :: try_new_async ( file)
106138 . await
107139 . map_err ( |e| map_orc_error ( e, path) ) ?;
108- let projection = if let Projection :: Columns ( projection) = & self . projection {
109- ProjectionMask :: roots (
110- builder. file_metadata ( ) . root_data_type ( ) ,
111- projection. iter ( ) . map ( |index| index + 1 ) ,
112- )
113- } else {
114- ProjectionMask :: all ( )
115- } ;
140+ let mut projection = ProjectionMask :: all ( ) ;
141+ if let Some ( schema) = & self . inferred_schema {
142+ if let Projection :: Columns ( p) = & schema. projection {
143+ projection = ProjectionMask :: roots (
144+ builder. file_metadata ( ) . root_data_type ( ) ,
145+ p. iter ( ) . map ( |index| index + 1 ) ,
146+ ) ;
147+ }
148+ }
149+
116150 let reader = builder. with_projection ( projection) . build_async ( ) ;
117151 let ( factory, schema) = reader. into_parts ( ) ;
118- let factory = factory. unwrap ( ) ;
119- self . check_file_schema ( schema, path) ?;
152+ let stripe_factory = factory. unwrap ( ) ;
153+ let schema = if let Some ( inferred_schema) = & self . inferred_schema {
154+ inferred_schema. check_file_schema ( schema, path) ?;
155+ None
156+ } else {
157+ Some ( HashableSchema :: try_create ( schema) ?)
158+ } ;
120159
121- self . reader = Some ( ( path. to_string ( ) , factory, size) ) ;
160+ self . reader = Some ( ReadingFile {
161+ path : path. to_string ( ) ,
162+ stripe_factory,
163+ size,
164+ schema,
165+ } ) ;
122166 Ok ( true )
123167 }
124168}
@@ -134,8 +178,9 @@ impl AsyncSource for ORCSource {
134178 if self . reader . is_none ( ) && !self . next_part ( ) . await ? {
135179 return Ok ( None ) ;
136180 }
137- if let Some ( ( path, factory, size) ) = mem:: take ( & mut self . reader ) {
138- let ( factory, stripe) = factory
181+ if let Some ( file) = mem:: take ( & mut self . reader ) {
182+ let ( factory, stripe) = file
183+ . stripe_factory
139184 . read_next_stripe ( )
140185 . await
141186 . map_err ( |e| ErrorCode :: StorageOther ( e. to_string ( ) ) ) ?;
@@ -144,10 +189,10 @@ impl AsyncSource for ORCSource {
144189 self . reader = None ;
145190 let progress_values = ProgressValues {
146191 rows : 0 ,
147- bytes : size,
192+ bytes : file . size ,
148193 } ;
149194 self . scan_progress . incr ( & progress_values) ;
150- Profile :: record_usize_profile ( ProfileStatisticsName :: ScanBytes , size) ;
195+ Profile :: record_usize_profile ( ProfileStatisticsName :: ScanBytes , file . size ) ;
151196 Profile :: record_usize_profile ( ProfileStatisticsName :: ScanPartitions , 1 ) ;
152197 continue ;
153198 }
@@ -157,11 +202,17 @@ impl AsyncSource for ORCSource {
157202 bytes : 0 ,
158203 } ;
159204 self . scan_progress . incr ( & progress_values) ;
160- self . reader = Some ( ( path . clone ( ) , Box :: new ( factory ) , size ) ) ;
205+
161206 let meta = Box :: new ( StripeInMemory {
162- path,
207+ path : file . path . clone ( ) ,
163208 stripe,
164- schema : None ,
209+ schema : file. schema . clone ( ) ,
210+ } ) ;
211+ self . reader = Some ( ReadingFile {
212+ path : file. path . clone ( ) ,
213+ stripe_factory : Box :: new ( factory) ,
214+ size : file. size ,
215+ schema : file. schema . clone ( ) ,
165216 } ) ;
166217 return Ok ( Some ( DataBlock :: empty_with_meta ( meta) ) ) ;
167218 }
0 commit comments